# Baseline Performance Comparison - EC-based methods
> Author: Shi Zhenkun   
> Email: zhenkun.shi@tib.cas.cn   
> Last update: 2024-10-29



## 0. Import packages

In [1]:
import os,sys
import pandas as pd

sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
import tools.bioFunctionLib as bfl
import tools.btools as btools
from tqdm import tqdm
import subprocess
from tkinter import _flatten # type: ignore
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1. Define Common Functions

In [2]:
#拼合测试集与预测结果    
def merge_test_set(testset, needmergeset, mergekey='uniprot_id', merge_way='left', fillna='NO-PREDICTION'):
    testset = testset[['uniprot_id', 'reaction_id', 'ec_number']].rename(columns={'reaction_id':'reaction_groundtruth','ec_number':'ec_groundtruth'})
    res = testset.merge(needmergeset, on=mergekey, how=merge_way).fillna(fillna)
    # res = testset.merge(needmergeset, on=mergekey, how=merge_way)
    return res

## 2. Load exp data -Fold 10

In [3]:
print('Loading validation datasets...')
vali_fasta_files = [f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.fasta' for fold_index in range(1, 11)]
vali_df_list = [pd.read_feather(f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.feather') for fold_index in range(1, 11)]


print('Loading EC2RXN map...')

#add reaction data
ds_rhea = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)
# caculate reaction_id ec_number map
ec_reaction_map = ds_rhea[['reaction_id', 'ec_number']].copy()
ec_reaction_map = ec_reaction_map.fillna('REACTION-WITHOUT-EC')
ec_reaction_map = pd.concat([ec_reaction_map, pd.DataFrame({'reaction_id':['NO-PREDICTION', '-'],'ec_number':['NO-PREDICTION', '-']})], axis=0).reset_index(drop=True)
ec_reaction_map.ec_number = ec_reaction_map.ec_number.apply(lambda x: x.replace('EC:',''))
ec_reaction_map = ec_reaction_map.assign(ec_number=ec_reaction_map['ec_number'].str.split(';')).explode('ec_number').reset_index(drop=True).rename(columns={'ec_number': 'ec'})


vali_df_list[0].head(3)


Loading validation datasets...
Loading EC2RXN map...


Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
105768,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
195319,C1AQW9,MRTPCSQHRRDRPSAIGSQLPDADTLDTRQPPLQEIPISSFADKTF...,RHEA:19669,3.6.5.-,1,4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
135884,P64647,MALFSKILIFYVIGVNISFVIIWFISHEKTHIRLLSAFLVGITWPM...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## 3. EC-based baselines

In [4]:
print('Loading validation datasets...')
vali_fasta_files = [
    f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.fasta' 
    for fold_index in range(1, 11)
]

Loading validation datasets...


### 3.1 Blast

In [44]:
print('Loading validation datasets...')
vali_feather_files = [
    f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.feather' 
    for fold_index in range(1, 11)
]

print('Loading trainning datasets...')
vali_train_feather_files =[
    f'{cfg.DIR_DATASET}validation/fold{fold_index}/train.feather'
    for fold_index in range(1,11)
]


vali_res_blast = [
    f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/blast/fold{item}.tsv' 
    for item in range(1, 11)
]

def blast_via_train_test(ds_train, ds_test):
    #blsting test set
    blast_res_ec = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=ds_test[['uniprot_id', 'seq']], k=1)
    blast_res_ec= blast_res_ec[['id','sseqid']].merge(ds_train[['uniprot_id', 'ec_number']], left_on='sseqid', right_on='uniprot_id', how='left')[['id','ec_number']].rename(columns={'id':'uniprot_id', 'ec_number':'ec_blast'})    
    
    #拼合test，加入无预测结果的数据
    blast_res_ec=ds_test[['uniprot_id', 'reaction_id','ec_number']].rename(columns={'reaction_id':'reaction_groundtruth', 'ec_number': 'ec_groundtruth'}).merge(blast_res_ec, on='uniprot_id', how='left').fillna('NO-PREDICTION')
    
    #add rxn info
    blast_res_ec['reaction_ecblast']=blast_res_ec.ec_blast.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
    
    
    return blast_res_ec

print('Runing prediction for each fold...')

for i in tqdm(range(10)):
    ds_train = pd.read_feather(vali_train_feather_files[i])
    ds_test = pd.read_feather(vali_feather_files[i])
    blast_res_ec = blast_via_train_test(ds_train,ds_test)  
    blast_res_ec.to_csv(f'{vali_res_blast[i]}.tsv', sep='\t', index=False)
    

Loading validation datasets...
Loading trainning datasets...
Runing prediction for each fold...


100%|██████████| 10/10 [08:04<00:00, 48.44s/it]


### 3.2 DeepEC

In [None]:
vali_res_deepec = [
    f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold{item}' 
    for item in range(1, 11)
]

print('Making command lines...')
commands = [
    f'singularity exec --nv --cleanenv /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif python /opt/deepec/deepec.py -i {vali_fasta_files[item]} -o {vali_res_deepec[item]}'
    for item in range(10)
]

# Execute each command in the shell and print output
for cmd in commands:
    print(f'Executing: {cmd}')
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print(f'Error: {result.stderr}')

In [6]:
# 生成 DeepEC 结果文件路径
res_deepec_files = [f'{item}/DeepEC_Result.txt' for item in vali_res_deepec]
# 加载 DeepEC 结果并重命名列
res_deepec = [btools.load_deepec_resluts(filepath=item).rename(columns={'id': 'uniprot_id'}) for item in res_deepec_files]
# 合并测试集和 DeepEC 结果
res_deepec = [merge_test_set(testset=vali_df_list[i], needmergeset=res_deepec[i], fillna='NO-PREDICTION') for i in range(10)]

# 计算反应
for i in range(10):
    res_deepec[i]['reaction_deepec'] = res_deepec[i].ec_deepec.parallel_apply(
        lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map)
    )
    
# 生成 TSV 文件路径并保存结果
res_deepec_files_tsv = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/deepec/res_fold_{i}.tsv' for i in range(1, 11)]
for i in range(10):
    res_deepec[i].to_csv(res_deepec_files_tsv[i], sep='\t', index=False)

# 结果示例数据    
pd.read_csv(res_deepec_files_tsv[i], sep='\t').head(3)

### 3.2 CLEAN

In [21]:
print('Loading validation datasets...')
vali_fasta_files = [
    f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.fasta' 
    for fold_index in range(1, 11)
]
vali_res_clean = [
    f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/res_fold_{item}.tsv' 
    for item in range(1, 11)
]

print('Making command lines...')
commands = [
    f'singularity exec --nv /hpcfs/fpublic/container/singularity/app/clean/clean.sif '
    f'python /app/inference.py -i {vali_fasta_files[item]} -o {vali_res_clean[item]} -d /hpcfs/fhome/shizhenkun/codebase/public_data/featurebank/protein/esm_embd_650_per_protein/'
    for item in range(10)
]

# # Execute each command in the shell and print output
# for cmd in commands:
#     print(f'Executing: {cmd}')
#     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
#     print(result.stdout)
#     if result.stderr:
#         print(f'Error: {result.stderr}')

Loading validation datasets...
Making command lines...


In [14]:
#format EC resluts from clean
test_clean = btools.load_clean_resluts(res_file='/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/clean/clean.txt_maxsep.csv').rename(columns={'Entry':'uniprot_id'})

# 拼合groundtruth
test_clean= merge_test_set(testset=ds_test, needmergeset= test_clean)
test_clean['reaction_clean']=test_clean.ec_clean.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_clean = test_clean.replace('', 'EC-WITHOUT-REACTION')


test_clean.to_csv(cfg.FILE_RESULTS_CLEAN, sep='\t', index=False)
print(f'Write clean results to: {cfg.FILE_RESULTS_CLEAN} ')
test_clean.head(3)

Write clean results to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/exp_test_clean.tsv 


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_clean,reaction_clean,isRight_reaction_clean
0,A9JLI2,-,-,3.2.2.6;1.4.3.2;4.2.3.81,RHEA:31427;RHEA:16301;RHEA:13781,False
1,A9JLI3,-,-,4.6.1.18,EC-WITHOUT-REACTION,False
2,A9JLI5,-,-,1.4.3.2,RHEA:13781,False


### 3.3 ECRECer

In [14]:
print('Using ECRECEer')
print('Loading validation datasets...')
vali_fasta_files = [
    f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.fasta' 
    for fold_index in range(1, 11)
]
vali_res_ecrecer = [
    f'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/ecrecer/res_fold_{item}.tsv' 
    for item in range(1, 11)
]

print('Making command lines...')
commands = [
    f'singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer.sif '
    f'python /ecrecer/production.py -i {vali_fasta_files[item]} -o {vali_res_ecrecer[item]} -mode h -topk 20'
    
    for item in range(10)
]

# Execute each command in the shell and print output
for cmd in commands:
    print(f'Executing: {cmd}')
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print(f'Error: {result.stderr}')


Using ECRECEer
Loading validation datasets...
Making command lines...
Executing: singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer.sif python /ecrecer/production.py -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/valid.fasta -o /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/ecrecer/res_fold_1.tsv -mode h -topk 20
INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
run in hybrid mode
step 1: loading data
step 2: find existing data
step 3: Embedding
Transferred model to GPU
step 4: run prediction
running in hybird mode
step 4: sequence alignment
use blast db:/ecrecer/data/uniprot_blast_db/production_blast.dmnd
Write finished
diamond blastp -d /ecrecer/data/uniprot_blast_db/production_blast.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
step 5: predict isEnzyme

   1/1590 [....

In [23]:
ecrecer_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecrecer.tsv'
print(f'''deepec cmd: 
conda activate DMLF
python /hpcfs/fhome/shizhenkun/codebase/DMLF/production.py  -i {input_fasta} -o {deepec_res_file} -mode p
      ''')

f' singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer_sandbox python /ecrecer/production.py -i ~/sample_10.fasta -o ~/res_sample_10_ecrecer.tsv -mode h -topk 10'

deepec cmd: 
conda activate DMLF
python /hpcfs/fhome/shizhenkun/codebase/DMLF/production.py  -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/deepec -mode p
      


In [15]:

dmlf = pd.read_csv('/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecrecer/ECRECer.csv', sep=',')[['id_input','ec_pred']].rename(columns={'id_input':'uniprot_id', 'ec_pred':'ec_ecrecer'})
test_dmlf = merge_test_set(testset=ds_test, needmergeset= dmlf)

test_dmlf.ec_ecrecer = test_dmlf.ec_ecrecer.apply(lambda x: x.replace(',',';'))
test_dmlf['reaction_ecrecer']=test_dmlf.ec_ecrecer.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_dmlf = test_dmlf.replace('', 'EC-WITHOUT-REACTION')

test_dmlf.to_csv(cfg.FILE_RESULTS_ECRECER, sep='\t', index=False)
print(f'Write ecrecer results to: {cfg.FILE_RESULTS_ECRECER} ')
test_dmlf.head(3)

Write ecrecer results to: /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/exp_test_ecrecer.tsv 


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecrecer,reaction_ecrecer,isRight_reaction_ecrecer
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True


### 3.4 CATFAM

In [36]:
catfam_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/catfam/catfam.txt'
print(f'''catfam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif /catfam/source/catsearch.pl  -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R -i {input_fasta}  -o {catfam_res_file} ''')

catfam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif /catfam/source/catsearch.pl  -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta  -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/catfam/catfam.txt 


In [8]:
# load results
catfam = btools.load_catfam_res(resfile=f'/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/catfam/catfam.txt').rename(columns={'id':'uniprot_id'}) 
catfam = catfam.fillna('-') # catfam 所有输入进去的序列都可以返回结果，故返回的结果中没有EC号的被认为是非酶
catfam = catfam.groupby('uniprot_id').agg({ 'uniprot_id':'first',  'ec_catfam': ';'.join}).reset_index(drop=True)

test_catfam = merge_test_set(testset=ds_test, needmergeset=catfam)
test_catfam['reaction_catfam']=test_catfam.ec_catfam.parallel_apply(lambda x: cfunc.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_catfam = test_catfam.replace('', 'EC-WITHOUT-REACTION')


test_catfam.to_csv(cfg.FILE_RESULTS_CATFAM, sep='\t', index=False)

test_catfam.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_catfam,reaction_catfam,isRight_reaction_catfam
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True


### 3.5 PRIAM

In [5]:
priam_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/priam/'
print(f'''priam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i {input_fasta} -o {priam_res_file} --blast_path /opt/blast-2.2.26/bin -np 100
 ''')

priam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/priam/ --blast_path /opt/blast-2.2.26/bin -np 100
 


In [6]:
# %conda activate DMLF
#! rm -rf /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18/PROFILES/LIBRARY
#! java -Xmx128G -jar ./PRIAM_search.jar -p /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18 -i /home/shizhenkun/codebase/preaction/data/datasets/ds_test.fasta -o /home/shizhenkun/codebase/preaction/results/baselines/priam/alfp0921 --blast_path /home/shizhenkun/downloads/blast-2.2.26/bin -np 100

# load results
priam = btools.load_praim_res(resfile=f'{cfg.RESULTS_DIR}baselines/priam/PRIAM_20240708030102/ANNOTATION/sequenceECs.txt').rename(columns={'id':'uniprot_id'})
test_priam = merge_test_set(testset=ds_test, needmergeset=priam)
test_priam['reaction_priam']=test_priam.ec_priam.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_priam = test_priam.replace('', 'EC-WITHOUT-REACTION')

test_priam.to_csv(cfg.FILE_RESULTS_PRIAM, sep='\t', index=False)

test_priam.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_priam,reaction_priam,isRight_reaction_catfam
0,A9JLI2,-,-,NO-PREDICTION,NO-PREDICTION,False
1,A9JLI3,-,-,1.14.11.51;2.3.2.27,RHEA:49524,False
2,A9JLI5,-,-,6.5.1.3,EC-WITHOUT-REACTION,False


### 3.6 ECPred

In [28]:
ECPred_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt'
singularity_ecpred = '/hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif'
print(f'''ECpred cmd: 
singularity exec {singularity_ecpred} java  -Xmx128G -jar /ECPred/ECPred.jar spmap  {input_fasta} /ECPred/ /tmp {ECPred_res_file}
      ''')

ECpred cmd: 
singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta /ECPred/ /tmp /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt
      


In [None]:
!singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif  java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta   /ECPred/  /hpcfs/fhome/shizhenkun/codebase/preaction/temp  /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt  

In [18]:
# ! java -Xmx128G -jar /home/shizhenkun/codebase/DMLF/baselines/ECPred/ECPred.jar spmap /home/shizhenkun/codebase/preaction/data/datasets/ds_test.fasta /home/shizhenkun/codebase/DMLF/baselines/ECPred/ /home/shizhenkun/codebase/preaction/temp /home/shizhenkun/codebase/preaction/results/baselines/ecpred/alfp0921.txt
# load results
ecpred = btools.load_ecpred_res(resfile=f'/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt')
ecpred = ecpred.groupby('id').agg({ 'id':'first',  'ec_ecpred': ';'.join}).reset_index(drop=True).replace('no Prediction', 'NO-PREDICTION').rename(columns={'id':'uniprot_id'})

test_ecpred = merge_test_set(testset=ds_test, needmergeset=ecpred)
test_ecpred['reaction_ecpred']=test_ecpred.ec_ecpred.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
test_ecpred = test_ecpred.replace('', 'EC-WITHOUT-REACTION')


test_ecpred.to_csv(cfg.FILE_RESULTS_ECPRED, sep='\t', index=False)
test_ecpred.head(3)


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecpred,reaction_ecpred,isRight_reaction_ecpred
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True


### 3.7 Blast

In [7]:
#trainning set
ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)

#blsting test set
blast_res_ec = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=ds_test[['uniprot_id', 'seq']], k=1)    
blast_res_ec= blast_res_ec[['id','sseqid']].merge(ds_train[['uniprot_id', 'ec_number']], left_on='sseqid', right_on='uniprot_id', how='left')[['id','ec_number']].rename(columns={'id':'uniprot_id', 'ec_number':'ec_ecblast'})
#拼合test，加入无预测结果的数据
blast_res_ec=ds_test[['uniprot_id', 'reaction_id','ec_number']].rename(columns={'reaction_id':'reaction_groundtruth', 'ec_number': 'ec_groundtruth'}).merge(blast_res_ec, on='uniprot_id', how='left').fillna('NO-PREDICTION')
#add rxn info
blast_res_ec['reaction_ecblast']=blast_res_ec.ec_ecblast.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))

blast_res_ec.to_csv(cfg.FILE_RESULTS_BLAST_EC,sep='\t', index=False)

blast_res_ec.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecblast,reaction_ecblast
0,A9JLI2,-,-,-,-
1,A9JLI3,-,-,-,-
2,A9JLI5,-,-,-,-


In [7]:
cfg.FILE_WEB_PROTEIONS

'/hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/web/web_proteins.feather'

In [4]:
rxn = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)
rxn.head(3)

Unnamed: 0,reaction_id,equation,chebi_id,ec_number,equation_chebi,equation_smiles,equation_chebi_balanced
0,RHEA:22636,dCTP + H2O = dCMP + diphosphate + H(+),CHEBI:61481;CHEBI:15377;CHEBI:57566;CHEBI:3301...,EC:3.6.1.9;EC:3.6.1.12;EC:3.6.1.65,CHEBI:61481 + CHEBI:15377 = CHEBI:57566 + CHEB...,Nc1ccn([C@H]2C[C@H](O)[C@@H](COP([O-])(=O)OP([...,CHEBI:61481 + CHEBI:15377 = CHEBI:57566 + CHEB...
1,RHEA:22640,NADP(+) + sphinganine = 3-oxosphinganine + H(+...,CHEBI:58349;CHEBI:57817;CHEBI:58299;CHEBI:1537...,EC:1.1.1.102,CHEBI:58349 + CHEBI:57817 = CHEBI:58299 + CHEB...,NC(=O)c1ccc[n+](c1)[C@@H]1O[C@H](COP([O-])(=O)...,CHEBI:58349 + CHEBI:57817 = CHEBI:58299 + CHEB...
2,RHEA:22644,O2 + protopine + reduced [NADPH--hemoprotein r...,CHEBI:15379;CHEBI:16415;CHEBI:57618;CHEBI:1710...,EC:1.14.14.98,CHEBI:15379 + CHEBI:16415 + CHEBI:57618 = CHEB...,O=O.CN1CCc2cc3OCOc3cc2C(=O)Cc2ccc3OCOc3c2C1.Cc...,CHEBI:15379 + CHEBI:16415 + CHEBI:57618 = CHEB...


In [21]:
rxn['ler']=rxn.equation_chebi.apply(lambda x: set(x.split(' = ')[1].split(' + ')) == set(x.split(' = ')[0].split(' + ')))

In [25]:
rxn[rxn.ler&rxn.equation.str.contains('out')]

Unnamed: 0,reaction_id,equation,chebi_id,ec_number,equation_chebi,equation_smiles,equation_chebi_balanced,ler
793,RHEA:27950,L-phenylalanine(in) = L-phenylalanine(out),CHEBI:58095,,CHEBI:58095 = CHEBI:58095,[NH3+][C@@H](Cc1ccccc1)C([O-])=O>>[NH3+][C@@H]...,CHEBI:58095 = CHEBI:58095,True
895,RHEA:28450,N(6)-(D-psicosyl)-L-lysine(in) = N(6)-(D-psico...,CHEBI:61403,,CHEBI:61403 = CHEBI:61403,[NH3+][C@@H](CCCC[NH2+]CC(=O)[C@H](O)[C@H](O)[...,CHEBI:61403 = CHEBI:61403,True
896,RHEA:28454,N(6)-(D-fructosyl)-L-lysine(in) = N(6)-(D-fruc...,CHEBI:61393,,CHEBI:61393 = CHEBI:61393,[NH3+][C@@H](CCCC[NH2+]CC(=O)[C@@H](O)[C@H](O)...,CHEBI:61393 = CHEBI:61393,True
897,RHEA:28458,biotin(in) = biotin(out),CHEBI:57586,,CHEBI:57586 = CHEBI:57586,[H][C@]12CS[C@@H](CCCCC([O-])=O)[C@@]1([H])NC(...,CHEBI:57586 = CHEBI:57586,True
898,RHEA:28474,D-glucarate(in) + H(+)(in) = D-glucarate(out) ...,CHEBI:30612;CHEBI:15378,,CHEBI:30612 + CHEBI:15378 = CHEBI:30612 + CHEB...,O[C@@H]([C@H](O)[C@@H](O)C([O-])=O)[C@H](O)C([...,CHEBI:30612 + CHEBI:15378 = CHEBI:30612 + CHEB...,True
...,...,...,...,...,...,...,...,...
9743,RHEA:76631,Ca(2+)(in) + n H(+)(out) = Ca(2+)(out) + n H(+...,CHEBI:29108;CHEBI:15378,,CHEBI:29108 + CHEBI:15378 = CHEBI:29108 + CHEB...,[Ca++].[H+]>>[Ca++].[H+],CHEBI:29108 + CHEBI:15378 = CHEBI:29108 + CHEB...,True
9744,RHEA:76635,n H(+)(out) + Mn(2+)(in) = n H(+)(in) + Mn(2+)...,CHEBI:15378;CHEBI:29035,,CHEBI:15378 + CHEBI:29035 = CHEBI:15378 + CHEB...,[H+].[Mn++]>>[H+].[Mn++],CHEBI:15378 + CHEBI:29035 = CHEBI:15378 + CHEB...,True
9745,RHEA:76639,cytosine(out) = cytosine(in),CHEBI:16040,,CHEBI:16040 = CHEBI:16040,Nc1cc[nH]c(=O)n1>>Nc1cc[nH]c(=O)n1,CHEBI:16040 = CHEBI:16040,True
9746,RHEA:76643,xanthine(out) = xanthine(in),CHEBI:17712,,CHEBI:17712 = CHEBI:17712,O=c1[nH]c2[nH]cnc2c(=O)[nH]1>>O=c1[nH]c2[nH]cn...,CHEBI:17712 = CHEBI:17712,True


In [9]:
rxn[rxn.reaction_id=='RHEA:60184'].values

array([['RHEA:60184',
        'ATP + D-glucose(out) + H2O = ADP + D-glucose(in) + H(+) + phosphate',
        'CHEBI:30616;CHEBI:4167;CHEBI:15377;CHEBI:456216;CHEBI:15378;CHEBI:43474',
        None,
        'CHEBI:30616 + CHEBI:4167 + CHEBI:15377 = CHEBI:456216 + CHEBI:4167 + CHEBI:15378 + CHEBI:43474',
        'Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP([O-])(=O)OP([O-])(=O)OP([O-])([O-])=O)[C@@H](O)[C@H]1O.OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O.[H]O[H]>>Nc1ncnc2n(cnc12)[C@@H]1O[C@H](COP([O-])(=O)OP([O-])([O-])=O)[C@@H](O)[C@H]1O.OC[C@H]1OC(O)[C@H](O)[C@@H](O)[C@@H]1O.[H+].OP([O-])([O-])=O',
        'CHEBI:30616 + CHEBI:4167 + CHEBI:15377 = CHEBI:456216 + CHEBI:4167 + CHEBI:15378 + CHEBI:43474']],
      dtype=object)