## Methods: 基于蛋白结构的反应预测
> 2024-12-20

### 1. 导入必要的包

In [3]:
import pandas as pd
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tqdm import tqdm
from tools import bioFunctionLib as bfl 
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
FIRST_TIME_RUN = False
import subprocess

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. Load test data

In [5]:
def load_10folds_data(type='test'):
    if type == 'test':
        file_path = [f'{cfg.DIR_DATASET}validation/fold{fold_num+1}/valid.feather' for fold_num in range(10)]
    if type =='train':
        file_path = [f'{cfg.DIR_DATASET}validation/fold{fold_num+1}/train.feather' for fold_num in range(10)]
    res = [pd.read_feather(path)[['uniprot_id','seq','reaction_id']].rename(columns={'reaction_id': 'rxn_groundtruth'}) for path in file_path]
    return res


print('Loading pdbfile ...' )
data_pdb = pd.read_feather(cfg.FILE_DS_PDB_LIST)


print('Loading 10-folds  data ...' )
data_trian = load_10folds_data(type='train')
data_test = load_10folds_data(type='test')

Loading uniprot_rxn_dict ...
Loading 10-folds test data ...


Unnamed: 0,uniprot_id,seq,rxn_groundtruth
105768,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-
195319,C1AQW9,MRTPCSQHRRDRPSAIGSQLPDADTLDTRQPPLQEIPISSFADKTF...,RHEA:19669
135884,P64647,MALFSKILIFYVIGVNISFVIIWFISHEKTHIRLLSAFLVGITWPM...,-


### 3. MakeDBs

In [3]:
if FIRST_TIME_RUN:
    trains_ids = [item.uniprot_id.to_list() for item in data_trian]
    for i in tqdm(range(10)):
        fold_num=i
        db_name = f'fold{i+1}'
        prep_df = data_pdb[data_pdb.uniprot_id.isin(trains_ids[fold_num]).reset_index(drop=True)]
        bfl.make_foldseek_db(prp_df=prep_df, db_name=db_name)

### 4. Make testDBS

In [4]:
if FIRST_TIME_RUN:
    test_ids = [item.uniprot_id.to_list() for item in data_test]
    for i in tqdm(range(10)):
        fold_num=i
        db_name = f'fold{i+1}'
        prep_df = data_pdb[data_pdb.uniprot_id.isin(test_ids[fold_num]).reset_index(drop=True)]
        bfl.gather_test_pdb_db(prp_df=prep_df, db_name=db_name)


### 5. 执行比对脚本

In [13]:
# fold_num = 1
# fseek_cols='query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,prob,lddt,alntmscore,qtmscore,ttmscore'
# fseek_cmd =f'time foldseek easy-search {cfg.DIR_FOLDSEEK_PDB}fold{fold_num}/pdb_test/ {cfg.DIR_FOLDSEEK_PDB}fold1/DB/fold1 \
# {cfg.RESULTS_DIR}/structural/foldseek_res_fold{fold_num}.m8 {cfg.TEMP_DIR} --alignment-type 0 --format-output {fseek_cols} \
# --tmscore-threshold 0.2 --tmalign-hit-order 0 --max-seqs 1000'

# fseek_cmd

In [87]:
# Writing the SLURM script to a file named 's3-1_structure_foldseek.slurm'

script_content = """#!/bin/bash
#SBATCH --job-name=foldseek_batch                       # 作业名称
#SBATCH --partition=qcpu_23i                            # 队列名称
#SBATCH --nodes=1                                       # 每个作业使用一个节点
#SBATCH --ntasks-per-node=1                             # 每个节点的任务数（单任务）
#SBATCH --cpus-per-task=63                              # 每个任务使用的 CPU 核心数
#SBATCH --mem=500G                                      # 每个节点使用的内存
#SBATCH --array=0-9                                     # 数组任务索引范围
#SBATCH --output=slurmlog/RXNRECer-FOLDSEEK-%A_%a.out   # 标准输出文件
#SBATCH --error=slurmlog/RXNRECer-FOLDSEEK-%A_%a.err    # 标准错误文件

# 配置变量
FOLD_NUM=$((SLURM_ARRAY_TASK_ID + 1)) # 将数组索引转换为 fold_num (1 到 10)
CFG_DIR="/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/foldseek"
RESULTS_DIR="/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/structural"
BASE_TEMP_DIR="/hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp"
TEMP_DIR="${BASE_TEMP_DIR}/task_${SLURM_ARRAY_TASK_ID}" # 动态临时目录
FSEEK_COLS="query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,prob,lddt,alntmscore,qtmscore,ttmscore"

# 创建日志目录（如果不存在）
mkdir -p slurmlog

# 为当前任务创建独立的临时目录
mkdir -p ${TEMP_DIR}

# 输出任务开始信息
echo "Job started: $(date)"
echo "Running foldseek for fold_num=${FOLD_NUM} with ${SLURM_CPUS_PER_TASK} threads"
echo "Task is running on node: $(hostname)"
echo "Temporary directory: ${TEMP_DIR}"

# 设置 OpenMP 线程数
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# 运行命令
time foldseek easy-search ${CFG_DIR}/fold${FOLD_NUM}/pdb_test/ ${CFG_DIR}/fold{FOLD_NUM}/DB/fold{FOLD_NUM} \
${RESULTS_DIR}/foldseek_res_fold${FOLD_NUM}.m8 ${TEMP_DIR} \
--alignment-type 0 --format-output ${FSEEK_COLS} \
--tmscore-threshold 0.2 --tmalign-hit-order 0 --max-seqs 1000

# 清理任务临时目录（可选，如果需要保留临时文件，注释掉以下行）
rm -rf ${TEMP_DIR}
echo "Deleted temporary directory: ${TEMP_DIR}"

# 输出任务结束信息
echo "Job ended: $(date)"
"""

file_path = "s3-1_structure_foldseek.slurm"

with open(file_path, "w") as file:
    file.write(script_content)

file_path

# ! sbatch s3-1_structure_foldseek.slurm

's3-1_structure_foldseek.slurm'

### 6. Load results

In [12]:
res_fold_seek = [f'{cfg.RESULTS_DIR}structural/foldseek_res_fold{item}.m8' for item in range(1,11)]
col_names=["query", "target", "fident", "alnlen", "mismatch", "gapopen", "qstart", "qend", "tstart", "tend", "evalue", "bits", "prob", "lddt", "alntmscore", "qtmscore", "ttmscore"]
foldseek_structural_pred_res = [f'{cfg.RESULTS_DIR}intermediate/structural/foldseekaln_fold{i+1}.tsv' for i in range(10)]

if FIRST_TIME_RUN: # 如果第一次跑，添加groundtruth
    for i in tqdm(range(10)):
        df = pd.read_table(res_fold_seek[i],  sep='\t', index_col=None, names=col_names, low_memory=False)[['query', 'target', 'fident', 'alntmscore']]
        df = df.loc[df.groupby('query')['alntmscore'].idxmax()].reset_index(drop=True)
        df.insert(0, 'uniprot_id', df['query'].apply(lambda x: x.split('-')[1]))
        df.insert(1, 'ref_id', df['target'].apply(lambda x: x.split('-')[1]))
        df = df.merge(data_trian[i], left_on='ref_id', right_on='uniprot_id', how='left')[['uniprot_id_x', 'ref_id','fident', 'alntmscore', 'rxn_groundtruth']].rename(columns={'uniprot_id_x':'uniprot_id', 'rxn_groundtruth':'rxn_foldseekaln'})
        df = data_test[i].merge(df, on='uniprot_id', how='left')
        df.rxn_foldseekaln = df.rxn_foldseekaln.fillna('NO-PREDICTION')
        df.to_csv(foldseek_structural_pred_res[i], sep='\t', index=False)
        print(f'File saved to:{foldseek_structural_pred_res[i]} with {len(df)} records')
    
fold_res = [ pd.read_csv(file, sep='\t') for file in foldseek_structural_pred_res ] 

In [9]:
for i in tqdm(range(10)):
    df = pd.read_table(res_fold_seek[i],  sep='\t', index_col=None, names=col_names, low_memory=False)[['query', 'target', 'fident', 'alntmscore']]
    df = df.loc[df.groupby('query')['alntmscore'].idxmax()].reset_index(drop=True)
    df.insert(0, 'uniprot_id', df['query'].apply(lambda x: x.split('-')[1]))
    df.insert(1, 'ref_id', df['target'].apply(lambda x: x.split('-')[1]))
    df = df.merge(data_trian[i], left_on='ref_id', right_on='uniprot_id', how='left')[['uniprot_id_x', 'ref_id','fident', 'alntmscore', 'rxn_groundtruth']].rename(columns={'uniprot_id_x':'uniprot_id', 'rxn_groundtruth':'rxn_foldseekaln'})
    df = data_test[i].merge(df, on='uniprot_id', how='left')
    df.rxn_foldseekaln = df.rxn_foldseekaln.fillna('NO-PREDICTION')
    df.to_csv(foldseek_structural_pred_res[i], sep='\t', index=False)
    print(f'File saved to:{foldseek_structural_pred_res[i]} with {len(df)} records')

 10%|█         | 1/10 [00:43<06:34, 43.78s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold1.tsv with 50858 records


 20%|██        | 2/10 [01:27<05:50, 43.82s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold2.tsv with 50858 records


 30%|███       | 3/10 [02:11<05:07, 43.94s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold3.tsv with 50858 records


 40%|████      | 4/10 [02:55<04:23, 43.95s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold4.tsv with 50858 records


 50%|█████     | 5/10 [03:40<03:40, 44.11s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold5.tsv with 50858 records


 60%|██████    | 6/10 [04:24<02:56, 44.11s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold6.tsv with 50858 records


 70%|███████   | 7/10 [05:09<02:13, 44.42s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold7.tsv with 50858 records


 80%|████████  | 8/10 [05:54<01:29, 44.55s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold8.tsv with 50858 records


 90%|█████████ | 9/10 [06:37<00:44, 44.23s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold9.tsv with 50858 records


100%|██████████| 10/10 [07:22<00:00, 44.30s/it]

File saved to:/hpcfs/fhome/shizhenkun/codebase/RXNRECer/results/intermediate/structural/foldseekaln_fold10.tsv with 50858 records





NameError: name 'fold_res' is not defined