## Methods: 基于蛋白结构的反应预测
> 2024-12-20

### 1. 导入必要的包

In [9]:
import pandas as pd
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tqdm import tqdm
from tools import btools
from tools import filetool
from tools import bioFunctionLib as bfl 
import rxnrecer as production
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
import subprocess

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 2. Load test data

In [3]:
def load_10folds_data(type='test'):
    """Load one split of the 10-fold cross-validation data, one DataFrame per fold.

    For each fold k in 1..10 this reads
    ``{cfg.DIR_DATASET}validation/fold{k}/<file>.feather``, keeps the
    ``uniprot_id``, ``seq`` and ``reaction_id`` columns, and renames
    ``reaction_id`` to ``rxn_groundtruth``.

    Args:
        type: Which split to load. ``'test'`` reads ``valid.feather`` and
            ``'train'`` reads ``train.feather``. The parameter name shadows the
            builtin ``type`` but is kept because callers pass it by keyword.

    Returns:
        A list of 10 DataFrames ordered fold 1 .. fold 10.

    Raises:
        ValueError: If ``type`` is neither ``'test'`` nor ``'train'``.
    """
    # Map the split name to the feather file stored inside every fold directory.
    split_files = {'test': 'valid.feather', 'train': 'train.feather'}

    # Fail early with a clear message; previously an unknown value fell through
    # both `if` branches and crashed later with an UnboundLocalError.
    if type not in split_files:
        raise ValueError(
            f"Unknown split type {type!r}; expected one of {sorted(split_files)}"
        )

    file_name = split_files[type]
    file_path = [
        f'{cfg.DIR_DATASET}validation/fold{fold_num+1}/{file_name}'
        for fold_num in range(10)
    ]

    wanted_cols = ['uniprot_id', 'seq', 'reaction_id']
    res = [
        pd.read_feather(path)[wanted_cols].rename(columns={'reaction_id': 'rxn_groundtruth'})
        for path in file_path
    ]

    return res


# Read the table referenced by cfg.FILE_DS_PDB_LIST.
print('Loading pdbfile ...' )
data_pdb = pd.read_feather(cfg.FILE_DS_PDB_LIST)

# Load the train split first, then the test split; each is a list with one
# DataFrame per fold. (`data_trian` is a misspelling of "train", kept because
# other cells refer to this name.)
print('Loading 10-folds  data ...' )
data_trian, data_test = (
    load_10folds_data(type=split_name) for split_name in ('train', 'test')
)


Loading pdbfile ...
Loading 10-folds  data ...


### 3. MakeDBs

In [34]:

# trains_ids = [item.uniprot_id.to_list() for item in data_trian]
# for i in tqdm(range(10)):
#     fold_num=i
#     db_name = f'fold{i+1}'
#     prep_df = data_pdb[data_pdb.uniprot_id.isin(trains_ids[fold_num]).reset_index(drop=True)]
#     bfl.make_foldseek_db(prp_df=prep_df, db_name=db_name)

### 4. Make test DBs

In [12]:
# test_ids = [item.uniprot_id.to_list() for item in data_test]
# for i in tqdm(range(10)):
#     fold_num=i
#     db_name = f'fold{i+1}'
#     prep_df = data_pdb[data_pdb.uniprot_id.isin(test_ids[fold_num]).reset_index(drop=True)]
#     bfl.gather_test_pdb_db(prp_df=prep_df, db_name=db_name)


### 5. 执行比对脚本

In [13]:
# fold_num = 1
# fseek_cols='query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,prob,lddt,alntmscore,qtmscore,ttmscore'
# fseek_cmd =f'time foldseek easy-search {cfg.DIR_FOLDSEEK_PDB}fold{fold_num}/pdb_test/ {cfg.DIR_FOLDSEEK_PDB}fold1/DB/fold1 \
# {cfg.RESULTS_DIR}/structural/foldseek_res_fold{fold_num}.m8 {cfg.TEMP_DIR} --alignment-type 0 --format-output {fseek_cols} \
# --tmscore-threshold 0.2 --tmalign-hit-order 0 --max-seqs 1000'

# fseek_cmd

In [6]:
# Writing the SLURM script to a file named 's3-1_structure_foldseek.slurm'

# SLURM array-job script (one task per cross-validation fold) that runs
# `foldseek easy-search` with the fold's test structures as queries.
#
# Fix: the target database used to be hard-coded to fold 1
# (`fold1/DB/fold1`) for every array task, so folds 2-10 were searched against
# the wrong training database. The target now follows ${FOLD_NUM}.
# NOTE(review): this assumes every fold's database lives at
# `fold<k>/DB/fold<k>`, mirroring the fold-1 path that was here - confirm
# against the output layout of the DB-building step.
#
# NOTE(review): SLURM generally needs the `--output`/`--error` directory to
# exist when the job starts, so `slurmlog/` should be created before calling
# `sbatch`; the `mkdir -p slurmlog` inside the script may run too late - confirm.
#
# This is a normal (non-raw) Python string, so each trailing backslash + newline
# below is consumed by Python and the foldseek command ends up on one line in
# the generated script; the shell command is the same either way.
script_content = """#!/bin/bash
#SBATCH --job-name=foldseek_batch                       # 作业名称
#SBATCH --partition=qcpu_23i                            # 队列名称
#SBATCH --nodes=1                                       # 每个作业使用一个节点
#SBATCH --ntasks-per-node=1                             # 每个节点的任务数（单任务）
#SBATCH --cpus-per-task=63                              # 每个任务使用的 CPU 核心数
#SBATCH --mem=500G                                      # 每个节点使用的内存
#SBATCH --array=0-9                                     # 数组任务索引范围
#SBATCH --output=slurmlog/RXNRECer-FOLDSEEK-%A_%a.out   # 标准输出文件
#SBATCH --error=slurmlog/RXNRECer-FOLDSEEK-%A_%a.err    # 标准错误文件

# 配置变量
FOLD_NUM=$((SLURM_ARRAY_TASK_ID + 1)) # 将数组索引转换为 fold_num (1 到 10)
CFG_DIR="/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/foldseek"
RESULTS_DIR="/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/structural"
BASE_TEMP_DIR="/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp"
TEMP_DIR="${BASE_TEMP_DIR}/task_${SLURM_ARRAY_TASK_ID}" # 动态临时目录
FSEEK_COLS="query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,prob,lddt,alntmscore,qtmscore,ttmscore"

# 创建日志目录（如果不存在）
mkdir -p slurmlog

# 为当前任务创建独立的临时目录
mkdir -p "${TEMP_DIR}"

# 输出任务开始信息
echo "Job started: $(date)"
echo "Running foldseek for fold_num=${FOLD_NUM} with ${SLURM_CPUS_PER_TASK} threads"
echo "Task is running on node: $(hostname)"
echo "Temporary directory: ${TEMP_DIR}"

# 设置 OpenMP 线程数
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# 运行命令
# Query = test structures of this fold; target = the database of the SAME fold.
time foldseek easy-search ${CFG_DIR}/fold${FOLD_NUM}/pdb_test/ ${CFG_DIR}/fold${FOLD_NUM}/DB/fold${FOLD_NUM} \
${RESULTS_DIR}/foldseek_res_fold${FOLD_NUM}.m8 ${TEMP_DIR} \
--alignment-type 0 --format-output ${FSEEK_COLS} \
--tmscore-threshold 0.2 --tmalign-hit-order 0 --max-seqs 1000

# 清理任务临时目录（可选，如果需要保留临时文件，注释掉以下行）
rm -rf "${TEMP_DIR}"
echo "Deleted temporary directory: ${TEMP_DIR}"

# 输出任务结束信息
echo "Job ended: $(date)"
"""

# file_path = "s3-1_structure_foldseek.slurm"

# with open(file_path, "w") as file:
#     file.write(script_content)

# file_path

# ! sbatch s3-1_structure_foldseek.slurm

### 6. Load results


In [10]:
# Load the feather table `protein3di_embd` from cfg.TEMP_DIR (the path is built
# by plain concatenation, so cfg.TEMP_DIR is expected to end with a separator).
# NOTE(review): the extension is spelled '.feater', not '.feather'. It has to
# match the file name on disk, so do not "correct" it here unless the file is
# renamed as well.
tdi_embd = pd.read_feather(f'{cfg.TEMP_DIR}protein3di_embd.feater')

In [11]:
# Display the loaded table as the cell output.
tdi_embd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.060791,0.036926,0.004887,0.139893,-0.081238,-0.092773,-0.013260,-0.079285,0.012558,0.121826,...,0.089233,-0.194824,-0.027512,0.045959,0.028366,-0.014610,0.129761,0.118103,-0.089478,0.021271
1,0.112366,0.086243,0.116150,0.080078,-0.035919,0.014214,0.084595,-0.002216,-0.021530,-0.069702,...,0.104919,-0.071777,0.054657,0.076111,0.151978,-0.004089,0.209839,0.090820,-0.008339,0.112244
2,0.175171,0.059509,0.090027,0.138062,0.040680,-0.006828,0.152222,-0.248657,-0.003510,-0.288330,...,0.036194,-0.134033,0.105774,-0.005856,0.188843,0.015099,0.022690,0.152954,-0.039093,0.038696
3,0.242065,0.116089,0.049072,0.156616,0.002560,-0.038177,0.136230,0.034241,-0.061646,0.080261,...,0.109070,-0.114258,0.009323,0.053284,0.270752,0.045532,0.026443,0.080750,0.032501,0.019333
4,0.120422,0.073120,0.074097,0.001993,-0.028976,-0.145264,-0.036926,0.025620,0.020111,-0.140869,...,-0.049530,-0.195923,0.003269,-0.009521,0.033173,-0.031464,0.093811,0.013428,-0.108215,-0.048767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522097,0.049011,0.074463,-0.060455,-0.118164,0.083069,-0.017487,0.104126,-0.068176,-0.011765,-0.056732,...,-0.015450,-0.170654,0.049561,-0.071411,0.072388,-0.014122,-0.062378,0.059753,-0.229492,-0.018616
522098,0.055267,0.067810,-0.049500,-0.094360,0.072083,-0.008217,0.103149,-0.070129,-0.002375,-0.067078,...,-0.019424,-0.180054,0.055420,-0.071899,0.071045,0.001160,-0.061859,0.065857,-0.243652,-0.004753
522099,0.048004,0.071472,-0.034058,-0.116089,0.075806,-0.010887,0.108459,-0.075928,-0.006886,-0.066772,...,-0.015747,-0.171509,0.051666,-0.052063,0.083984,0.000470,-0.046844,0.057037,-0.246216,-0.022308
522100,0.043610,0.081543,-0.039429,-0.131226,0.074524,-0.004433,0.114868,-0.068359,0.007034,-0.070312,...,-0.007263,-0.178711,0.051819,-0.062744,0.071411,-0.005203,-0.063599,0.060120,-0.250977,-0.009102


In [40]:
# Column names for the tab-separated foldseek result file, in the same order as
# the FSEEK_COLS list in the SLURM script cell. Because `names=` is passed
# without `header=`, pandas treats the first line of the file as data.
col_names = [
    "query", "target",
    "fident", "alnlen", "mismatch", "gapopen",
    "qstart", "qend", "tstart", "tend",
    "evalue", "bits", "prob", "lddt",
    "alntmscore", "qtmscore", "ttmscore",
]
df = pd.read_csv(
    '/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/aa.m8',
    sep='\t',
    index_col=None,
    names=col_names,
)
df


  df = pd.read_csv('/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/aa.m8',  sep='\t', index_col=None, names=col_names)


Unnamed: 0,query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,prob,lddt,alntmscore,qtmscore,ttmscore
0,AF-A4J5W7-F1-model_v4.pdb,AF-B1YI58-F1-model_v4.pdb,0.651,89,31,0,1,89,1,89,0.000094,263,1.000,0.9857,0.9940,0.9940,0.9940
1,AF-A4J5W7-F1-model_v4.pdb,AF-Q02WW4-F1-model_v4.pdb,0.584,89,37,0,1,89,1,89,0.000099,262,1.000,0.9868,0.9937,0.9937,0.9937
2,AF-A4J5W7-F1-model_v4.pdb,AF-Q5FQM0-F1-model_v4.pdb,0.561,89,39,0,1,89,1,89,0.000083,262,1.000,0.9715,0.9878,0.9878,0.9878
3,AF-A4J5W7-F1-model_v4.pdb,AF-A1A007-F1-model_v4.pdb,0.640,89,32,0,1,89,1,89,0.000111,261,1.000,0.9917,0.9956,0.9956,0.9956
4,AF-A4J5W7-F1-model_v4.pdb,AF-B8G3Z0-F1-model_v4.pdb,0.629,89,33,0,1,89,1,89,0.000124,259,1.000,0.995,0.9959,0.9959,0.9959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30940827,AF-O17861-F1-model_v4.pdb,AF-A8A2C6-F1-model_v4.pdb,0.065,122,105,3,137,258,15,127,6.027000,18,0.020,0.3059,0.2036,0.1027,0.1956
30940828,AF-O17861-F1-model_v4.pdb,AF-Q9D531-F1-model_v4.pdb,0.069,144,99,8,67,186,20,152,7.955000,18,0.020,0.3013,0.2500,0.1419,0.2329
30940829,AF-O17861-F1-model_v4.pdb,AF-B5X9L9-F1-model_v4.pdb,0.041,215,130,9,69,258,25,188,5.701000,18,0.020,0.2775,0.2213,0.1739,0.2213
30940830,AF-O17861-F1-model_v4.pdb,AF-D3ZVR7-F1-model_v4.pdb,0.050,120,92,4,70,181,27,132,7.955000,18,0.020,0.2625,0.2623,0.1320,0.1727


In [42]:
# Number of distinct values in the `query` column (missing values, if any,
# count as one distinct value).
df['query'].nunique(dropna=False)

49529

In [32]:
# Order rows by query, then prob, then alntmscore, all descending.
# The result is only displayed; `df` itself is not modified.
df.sort_values(
    by=['query', 'prob', 'alntmscore'],
    ascending=[False, False, False],
)

Unnamed: 0,query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,prob,lddt,alntmscore,qtmscore,ttmscore
1948,AF-J8HQ06-F1-model_v4,AF-O34580-F1-model_v4.pdb,0.133,659,384,31,1,494,14,650,2.700000e-15,453,1.000,0.5546,0.6626,0.66260,0.46070
1950,AF-J8HQ06-F1-model_v4,AF-P57654-F1-model_v4.pdb,0.152,670,359,30,1,494,5,641,8.175000e-15,438,1.000,0.5500,0.6605,0.66050,0.51670
1949,AF-J8HQ06-F1-model_v4,AF-P45612-F1-model_v4.pdb,0.137,656,382,30,1,494,12,645,3.515000e-15,442,1.000,0.5379,0.6593,0.65930,0.46610
1954,AF-J8HQ06-F1-model_v4,AF-Q8NVT1-F1-model_v4.pdb,0.144,658,374,28,1,494,10,642,5.651000e-15,434,1.000,0.5310,0.6585,0.65850,0.46140
1953,AF-J8HQ06-F1-model_v4,AF-Q5HN29-F1-model_v4.pdb,0.143,655,378,30,1,494,10,642,6.280000e-15,434,1.000,0.5372,0.6571,0.65710,0.45940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1943,AF-J8D1N4-F1-model_v4,AF-B1HTV8-F1-model_v4.pdb,0.037,160,82,8,30,173,249,352,4.166000e+00,22,0.040,0.2819,0.2132,0.11680,0.09034
1944,AF-J8D1N4-F1-model_v4,AF-P41109-F1-model_v4.pdb,0.112,62,51,2,11,68,23,84,4.838000e+00,21,0.034,0.2329,0.2578,0.06248,0.07358
1945,AF-J8D1N4-F1-model_v4,AF-P0C5W1-F1-model_v4.pdb,0.063,188,116,9,33,197,215,365,5.085000e+00,20,0.029,0.2691,0.2010,0.13420,0.06294
1946,AF-J8D1N4-F1-model_v4,AF-P27613-F1-model_v4.pdb,0.047,170,131,8,192,352,12,159,7.960000e+00,19,0.024,0.3154,0.2028,0.12500,0.20090
