# Baseline Performance Comparison - EC-based methods
> Author: Shi Zhenkun   
> Email: zhenkun.shi@tib.cas.cn   
> Last update: 2024-10-29



## 0. Import packages

In [1]:
import os,sys
import pandas as pd

sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
import tools.bioFunctionLib as bfl
import tools.btools as btools
from tqdm import tqdm
import subprocess
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from tkinter import _flatten # type: ignore
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1. Define Common Functions

In [2]:
# Join the test set with a method's prediction results
def merge_test_set(testset, needmergeset, mergekey='uniprot_id', merge_way='left', fillna='NO-PREDICTION'):
    """Attach a prediction table to the ground-truth columns of a test set.

    Args:
        testset: DataFrame holding at least 'uniprot_id', 'reaction_id' and 'ec_number'.
        needmergeset: DataFrame with the predictions; must contain the `mergekey` column.
        mergekey: column used as the join key.
        merge_way: join type handed to the pandas merge (`how`).
        fillna: value written into every cell that is missing after the join.

    Returns:
        DataFrame with 'uniprot_id', 'reaction_groundtruth', 'ec_groundtruth'
        followed by the remaining columns of `needmergeset`.
    """
    groundtruth_names = {'reaction_id': 'reaction_groundtruth', 'ec_number': 'ec_groundtruth'}
    groundtruth = testset.loc[:, ['uniprot_id', 'reaction_id', 'ec_number']].rename(columns=groundtruth_names)
    merged = pd.merge(groundtruth, needmergeset, on=mergekey, how=merge_way)
    return merged.fillna(fillna)

## 2. Load experiment data (10-fold cross-validation)

In [13]:
print('Loading validation datasets fasta path ...')
# FASTA file of the validation split for each of the ten folds (fold1 .. fold10).
vali_fasta_files = [f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.fasta' for fold_index in range(1, 11)]

print('Loading validation datasets feather path ...')
# Feather table of the validation split for the same ten folds.
vali_feather_files = [f'{cfg.DIR_DATASET}validation/fold{fold_index}/valid.feather' for fold_index in range(1, 11)]

print('Loading EC2RXN map...')

# Reaction table; only its 'reaction_id' and 'ec_number' columns are used below.
ds_rhea = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)

# Build the reaction_id <-> EC lookup:
#   1. reactions with a missing value get the placeholder 'REACTION-WITHOUT-EC',
#   2. two sentinel rows ('NO-PREDICTION' and '-') are appended so those labels map to themselves.
ec_reaction_map = pd.concat(
    [
        ds_rhea[['reaction_id', 'ec_number']].fillna('REACTION-WITHOUT-EC'),
        pd.DataFrame({'reaction_id':['NO-PREDICTION', '-'],'ec_number':['NO-PREDICTION', '-']}),
    ],
    axis=0,
).reset_index(drop=True)

# Strip the 'EC:' prefix, then split ';'-separated EC lists into one row per EC number.
ec_reaction_map['ec_number'] = ec_reaction_map['ec_number'].apply(lambda raw_ec: raw_ec.replace('EC:',''))
ec_reaction_map = (
    ec_reaction_map
    .assign(ec_number=ec_reaction_map['ec_number'].str.split(';'))
    .explode('ec_number')
    .reset_index(drop=True)
    .rename(columns={'ec_number': 'ec'})
)


ec_reaction_map.head(3)


Loading validation datasets fasta path ...
Loading validation datasets feather path ...
Loading EC2RXN map...


Unnamed: 0,reaction_id,ec
0,RHEA:22636,3.6.1.9
1,RHEA:22636,3.6.1.12
2,RHEA:22636,3.6.1.65


## 3. EC-based baselines

### 3.1 Blast

In [14]:
print('Loading trainning datasets...')
# Training split (feather) of each fold; used as the BLAST database side.
train_feather_files = [f'{cfg.DIR_DATASET}validation/fold{fold_index}/train.feather' for fold_index in range(1,11)]

# Per-fold TSV holding the BLAST-based predictions.
vali_res_blast = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/blast/fold{item}.tsv' for item in range(1, 11)]

def blast_via_train_test(ds_train, ds_test):
    """Predict EC numbers and reactions for a test set by BLAST against a training set.

    Args:
        ds_train: DataFrame with at least 'uniprot_id', 'seq' and 'ec_number'.
        ds_test: DataFrame with at least 'uniprot_id', 'seq', 'reaction_id' and 'ec_number'.

    Returns:
        DataFrame with 'uniprot_id', 'reaction_groundtruth', 'ec_groundtruth',
        'ec_blast' and 'reaction_ecblast'. Test proteins without a BLAST hit
        carry 'NO-PREDICTION' in 'ec_blast'.
    """
    # BLAST the test sequences against the training sequences.
    # k=1 is forwarded to bfl.getblast — presumably "keep the best hit only"; confirm in bfl.
    hits = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=ds_test[['uniprot_id', 'seq']], k=1)

    # Transfer the EC annotation of the matched training protein (sseqid) to the query protein (id).
    ec_by_query = (
        hits[['id', 'sseqid']]
        .merge(ds_train[['uniprot_id', 'ec_number']], left_on='sseqid', right_on='uniprot_id', how='left')
        .loc[:, ['id', 'ec_number']]
        .rename(columns={'id': 'uniprot_id', 'ec_number': 'ec_blast'})
    )

    # Join with the ground truth through the shared helper (same left join and
    # 'NO-PREDICTION' fill that the other baselines use) instead of repeating it inline.
    blast_res_ec = merge_test_set(testset=ds_test, needmergeset=ec_by_query)

    # Map every predicted EC string to reactions via the module-level ec_reaction_map.
    blast_res_ec['reaction_ecblast'] = blast_res_ec.ec_blast.parallel_apply(
        lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map)
    )
    return blast_res_ec

print('Runing prediction for each fold...')

# The per-fold BLAST run is kept commented out; the cell only previews the result file
# of the first fold, which must already exist on disk.
# NOTE(review): the entries of vali_res_blast already end in '.tsv', so the runner writes
# to vali_res_blast[i] directly (appending another '.tsv' would produce 'foldN.tsv.tsv',
# which the read_csv below would not find).
# for i in tqdm(range(10)):
#     ds_train = pd.read_feather(train_feather_files[i])
#     ds_test = pd.read_feather(vali_feather_files[i])
#     blast_res_ec = blast_via_train_test(ds_train,ds_test)  
#     blast_res_ec.to_csv(vali_res_blast[i], sep='\t', index=False)
    
pd.read_csv(f'{vali_res_blast[0]}', sep='\t').head(3)
    

Loading trainning datasets...
Runing prediction for each fold...


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_blast,reaction_ecblast
0,Q9UYB6,-,-,-,-
1,C1AQW9,RHEA:19669,3.6.5.-,3.6.5.-,EC-WITHOUT-REACTION
2,P64647,-,-,-,-


### 3.2 DeepEC

In [15]:
# Output directory that DeepEC writes into, one per fold (fold1 .. fold10).
vali_res_deepec = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/deepec/fold{item}' for item in range(1, 11)]

print('Making command lines...')
# One containerised DeepEC invocation per fold; within this cell the list is only
# consumed by the commented-out runner below.
commands = [
    'singularity exec --nv --cleanenv /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif '
    f'python /opt/deepec/deepec.py -i {vali_fasta_files[item]} -o {vali_res_deepec[item]}'
    for item in range(10)
]

# # Execute each command in the shell and print output
# for cmd in commands:
#     print(f'Executing: {cmd}')
#     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
#     print(result.stdout)
#     if result.stderr:
#         print(f'Error: {result.stderr}')


# Per-fold TSV that receives the DeepEC predictions merged with ground truth and reactions.
vali_res_deepec_rxn = [
    f'{cfg.DIR_RES_BASELINE}results/ec_methods/deepec/fold_{item}.tsv'
    for item in range(1, 11)
]

print('Format DeepEC results')
def format_deepec_results(indexer):
    """Merge the DeepEC predictions of one fold with the ground truth and save them as TSV.

    Args:
        indexer: zero-based position of the fold in vali_res_deepec,
            vali_feather_files and vali_res_deepec_rxn.

    Side effects:
        Writes vali_res_deepec_rxn[indexer] (tab-separated, no index) and prints its path.
    """
    # Load the raw DeepEC output of this fold and align the id column name with the test set.
    res = btools.load_deepec_resluts(filepath=f'{vali_res_deepec[indexer]}/DeepEC_Result.txt').rename(columns={'id': 'uniprot_id'})
    # Attach the ground truth; proteins without a DeepEC row get 'NO-PREDICTION'.
    ds_test = pd.read_feather(vali_feather_files[indexer])
    res = merge_test_set(testset=ds_test, needmergeset=res)
    # Map every predicted EC string to reactions via the module-level ec_reaction_map.
    res['reaction_deepec'] = res.ec_deepec.parallel_apply(
        lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map)
    )
    # Any cell left as an empty string is labelled 'EC-WITHOUT-REACTION'.
    res = res.replace('', 'EC-WITHOUT-REACTION')
    res.to_csv(vali_res_deepec_rxn[indexer], sep='\t', index=False)
    print(f'Write DeepEC results to: {vali_res_deepec_rxn[indexer]} ')


# The function used to be called format_clean_results, which collided with the CLEAN
# section's function of the same name. The old name is kept as an alias so existing
# references keep working; the CLEAN section rebinds it when its cell runs.
format_clean_results = format_deepec_results

# Plain loop instead of a list comprehension: the calls are made for their side effects only.
for fold_pos in range(10):
    format_deepec_results(indexer=fold_pos)

pd.read_csv(vali_res_deepec_rxn[0], sep='\t').head(3)


Making command lines...
Format DeepEC results
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_1.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_2.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_3.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_4.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_5.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_6.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_7.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/deepec/fold_8.tsv 
Write clean results to: /hpcfs/fhome/shizh

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_deepec,reaction_deepec
0,Q9UYB6,-,-,NO-PREDICTION,NO-PREDICTION
1,C1AQW9,RHEA:19669,3.6.5.-,3.6.5.n1,RHEA:19669
2,P64647,-,-,NO-PREDICTION,NO-PREDICTION


### 3.3 CLEAN

In [85]:
# Raw CLEAN output file, one per fold.
vali_res_clean = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/clean/res_fold_{item}.tsv' for item in range(1, 11)]

print('Making command lines...')
# One containerised CLEAN inference call per fold.
commands = [
    'singularity exec --nv /hpcfs/fpublic/container/singularity/app/clean/clean.sif '
    f'python /app/inference.py -i {vali_fasta_files[item]} -o {vali_res_clean[item]} '
    '-d /hpcfs/fhome/shizhenkun/codebase/public_data/featurebank/protein/esm_embd_650_per_protein/'
    for item in range(10)
]

# The runner below is commented out; only the progress message is printed.
print('Executing commands...')
# for cmd in commands:
#     print(f'Executing: {cmd}')
#     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
#     print(result.stdout)
#     if result.stderr:
#         print(f'Error: {result.stderr}')

# Per-fold TSV that receives the CLEAN predictions merged with ground truth and reactions.
vali_res_clean_rxn = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/clean/fold_{item}.tsv' for item in range(1, 11)]

print('Format clean results')
def format_clean_results(indexer):
    """Merge the CLEAN predictions of one fold with the ground truth and save them as TSV.

    Args:
        indexer: zero-based position of the fold in vali_res_clean,
            vali_feather_files and vali_res_clean_rxn.

    Side effects:
        Writes vali_res_clean_rxn[indexer] (tab-separated, no index) and prints its path.
    """
    # Parsed CLEAN output first, then the fold's ground-truth table.
    clean_pred = btools.load_clean_resluts(res_file=vali_res_clean[indexer])
    fold_truth = pd.read_feather(vali_feather_files[indexer])
    merged = merge_test_set(testset=fold_truth, needmergeset=clean_pred)

    # Map every predicted EC string to reactions via the module-level ec_reaction_map.
    merged['reaction_clean'] = merged['ec_clean'].parallel_apply(
        lambda ec: btools.retrival_reaction_from_ec(ec_pred=ec, ec_reaction_map=ec_reaction_map)
    )

    # Empty-string cells are labelled 'EC-WITHOUT-REACTION' before writing.
    out_path = vali_res_clean_rxn[indexer]
    merged.replace('', 'EC-WITHOUT-REACTION').to_csv(out_path, sep='\t', index=False)
    print(f'Write clean results to: {out_path} ')

# Format all ten folds, then preview the first result file.
[format_clean_results(indexer=i) for i in range(10)]

pd.read_csv(vali_res_clean_rxn[0], sep='\t').head(3)

Loading validation datasets...
Making command lines...
Executing commands...
Format clean results
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_1.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_2.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_3.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_4.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_5.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_6.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_7.tsv 
Write clean results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/clean/fold_8.tsv

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_clean,reaction_clean
0,Q9UYB6,-,-,5.3.1.29,RHEA:32243
1,C1AQW9,RHEA:19669,3.6.5.-,3.6.5.n1,RHEA:19669
2,P64647,-,-,1.14.18.3,RHEA:30355


### 3.4 ECRECer

In [124]:
print('Using ECRECEer')
# Raw ECRECer output file, one per fold.
vali_res_ecrecer = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/ecrecer/res_fold_{item}.tsv' for item in range(1, 11)]

print('Making command lines...')
# One containerised ECRECer call per fold; within this cell the list is only
# consumed by the commented-out runner below.
commands = [
    'singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer.sif '
    f'python /ecrecer/production.py -i {vali_fasta_files[item]} -o {vali_res_ecrecer[item]} '
    '-mode h -topk 20'
    for item in range(10)
]

# # Execute each command in the shell and print output
# for cmd in commands:
#     print(f'Executing: {cmd}')
#     result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
#     print(result.stdout)
#     if result.stderr:
#         print(f'Error: {result.stderr}')


print('Format ECRECer results')
# Per-fold TSV that receives the ECRECer predictions merged with ground truth and reactions.
vali_res_ecrecer_rxn = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/ecrecer/fold_{item}.tsv' for item in range(1, 11)]



def format_ecrecer_results(indexer):
    """Merge the ECRECer predictions of one fold with the ground truth and save them as TSV.

    Args:
        indexer: zero-based position of the fold in vali_res_ecrecer,
            vali_feather_files and vali_res_ecrecer_rxn.

    Side effects:
        Writes vali_res_ecrecer_rxn[indexer] (tab-separated, no index) and prints its path.
    """
    # Keep only the id and predicted-EC columns of the raw ECRECer table, renamed to the
    # notebook's conventions.
    raw_table = pd.read_csv(vali_res_ecrecer[indexer], sep='\t')
    ecrecer_pred = raw_table[['input_id','dmlf_ec']].rename(
        columns={'input_id':'uniprot_id', 'dmlf_ec':'ec_ecrecer'}
    )

    # Attach the ground truth of the same fold.
    fold_truth = pd.read_feather(vali_feather_files[indexer])
    merged = merge_test_set(testset=fold_truth, needmergeset=ecrecer_pred)

    # Map every predicted EC string to reactions via the module-level ec_reaction_map.
    merged['rxn_ecrecer'] = merged['ec_ecrecer'].parallel_apply(
        lambda ec: btools.retrival_reaction_from_ec(ec_pred=ec, ec_reaction_map=ec_reaction_map)
    )

    # Empty-string cells are labelled 'EC-WITHOUT-REACTION' before writing.
    out_path = vali_res_ecrecer_rxn[indexer]
    merged.replace('', 'EC-WITHOUT-REACTION').to_csv(out_path, sep='\t', index=False)
    print(f'Write ECRECer results to: {out_path} ')


# Format all ten folds, then preview the first result file.
[format_ecrecer_results(i) for i in range(10)]


pd.read_csv(vali_res_ecrecer_rxn[0], sep='\t').head(3)

Using ECRECEer
Loading validation datasets...
Making command lines...
Format ECRECer results


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecrecer,rxn_ecrecer
0,Q9UYB6,-,-,-,-
1,C1AQW9,RHEA:19669,3.6.5.-,3.6.5.-,EC-WITHOUT-REACTION
2,P64647,-,-,-,-


### 3.5 CATFAM

In [23]:
print('Using CatFam')
print('Loading validation datasets...')
# Raw CatFam output file, one per fold.
vali_res_catfam = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/catfam/res_fold_{item}.tsv' for item in range(1, 11)]

print('Making command lines...')
# One containerised catsearch.pl call per fold.
commands = [
    'singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif'
    ' /catfam/source/catsearch.pl -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R'
    f' -i {vali_fasta_files[item]} -o {vali_res_catfam[item]}'
    for item in range(10)
]

def execute_command(cmd):
    """Run a shell command and return its captured standard output.

    The command line is echoed before it runs. A non-zero exit status is reported
    even when the command wrote nothing to stderr, so silent failures are visible.
    Anything the command wrote to stderr is printed as well.

    Args:
        cmd: command line, executed through the shell (shell=True). Only pass
            trusted strings.

    Returns:
        The command's stdout as text.
    """
    print(f'Executing: {cmd}')
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    # Report failures that leave stderr empty; previously these went unnoticed.
    if result.returncode != 0:
        print(f'Error: command exited with code {result.returncode}: {cmd}')
    if result.stderr:
        print(f'Error: {result.stderr}')
    return result.stdout

# Execute commands in parallel
# with ThreadPoolExecutor(max_workers=10) as executor:
#     futures = [executor.submit(execute_command, cmd) for cmd in commands]
#     for future in as_completed(futures):
#         output = future.result()
#         print(output)


print('Format CatFam results')
# Per-fold TSV that receives the CatFam predictions merged with ground truth and reactions.
vali_res_catfam_rxn = [
    f'{cfg.DIR_RES_BASELINE}results/ec_methods/catfam/fold_{item}.tsv' 
    for item in range(1, 11)
]


def format_catfam_results(indexer):
    """Merge the CatFam predictions of one fold with the ground truth and save them as TSV.

    Args:
        indexer: zero-based position of the fold in vali_res_catfam,
            vali_feather_files and vali_res_catfam_rxn.

    Side effects:
        Writes vali_res_catfam_rxn[indexer] (tab-separated, no index) and prints its path.
    """
    # Load the raw CatFam output and align the id column name with the test set.
    res = btools.load_catfam_res(resfile=vali_res_catfam[indexer]).rename(columns={'id':'uniprot_id'})
    # CatFam returns a row for every input sequence, so a row without an EC number
    # is treated as a non-enzyme ('-').
    res = res.fillna('-')
    # Collapse multiple rows per protein into a single ';'-separated EC string.
    res = res.groupby('uniprot_id').agg({ 'uniprot_id':'first',  'ec_catfam': ';'.join}).reset_index(drop=True)

    # Attach the ground truth of the same fold.
    ds_test = pd.read_feather(vali_feather_files[indexer])
    test_catfam = merge_test_set(testset=ds_test, needmergeset=res)
    # Map every predicted EC string to reactions via the module-level ec_reaction_map.
    test_catfam['rxn_catfam'] = test_catfam.ec_catfam.parallel_apply(
        lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map)
    )
    # Empty-string cells are labelled 'EC-WITHOUT-REACTION' before writing.
    test_catfam = test_catfam.replace('', 'EC-WITHOUT-REACTION')

    test_catfam.to_csv(vali_res_catfam_rxn[indexer], sep='\t', index=False)
    # The message used to say "ECRECer" (copy-paste slip); it now names the right tool.
    print(f'Write CatFam results to: {vali_res_catfam_rxn[indexer]} ')


# Plain loop instead of a list comprehension: the calls are made for their side effects only.
for fold_pos in range(10):
    format_catfam_results(fold_pos)

pd.read_csv(vali_res_catfam_rxn[0], sep='\t').head(3)

Using CatFam
Loading validation datasets...
Making command lines...
Format CatFam results
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_1.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_2.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_3.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_4.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_5.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_6.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/catfam/fold_7.tsv 
Write ECRECer results to: /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/c

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_catfam,rxn_catfam
0,Q9UYB6,-,-,-,-
1,C1AQW9,RHEA:19669,3.6.5.-,-,-
2,P64647,-,-,-,-


### 3.6 PRIAM

In [17]:
print('Using PRIAM')
# Output directory handed to PRIAM via -o, one per fold.
vali_res_priam = [f'{cfg.DIR_RES_BASELINE}results/ec_methods/priam/res_fold_{item}/' for item in range(1, 11)]

print('Making command lines...')
# One containerised PRIAM_search.jar call per fold.
commands = [
    'singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif'
    ' /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18 '
    f' -i {vali_fasta_files[item]} -o {vali_res_priam[item]}'
    ' --blast_path /opt/blast-2.2.26/bin -np 100'
    for item in range(10)
]



def execute_command(cmd):
    """Run a shell command and return its captured standard output.

    The command line is echoed before it runs. A non-zero exit status is reported
    even when the command wrote nothing to stderr, so silent failures are visible.
    Anything the command wrote to stderr is printed as well.

    NOTE(review): a helper with the same name is also defined in the CatFam
    section; consider defining it once near the top of the notebook.

    Args:
        cmd: command line, executed through the shell (shell=True). Only pass
            trusted strings.

    Returns:
        The command's stdout as text.
    """
    print(f'Executing: {cmd}')
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    # Report failures that leave stderr empty; previously these went unnoticed.
    if result.returncode != 0:
        print(f'Error: command exited with code {result.returncode}: {cmd}')
    if result.stderr:
        print(f'Error: {result.stderr}')
    return result.stdout

# Run the PRIAM commands concurrently with up to ten worker threads; each thread calls
# execute_command for one command line.
# NOTE(review): every command requests -Xmx128G and -np 100 and up to ten run at once —
# confirm the host has the memory and cores for that, or lower max_workers.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(execute_command, cmd) for cmd in commands]
    # as_completed yields each future as soon as it finishes, so the printed outputs
    # follow completion order rather than fold order.
    for future in as_completed(futures):
        output = future.result()
        print(output)

Using PRIAM
Making command lines...
Executing: singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold1/valid.fasta -o /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/priam/res_fold_1/ --blast_path /opt/blast-2.2.26/bin -np 100
Executing: singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/data/datasets/task240524/validation/fold2/valid.fasta -o /hpcfs/fhome/shizhenkun/codebase/RXNRECer/baselines/results/ec_methods/priam/res_fold_2/ --blast_path /opt/blast-2.2.26/bin -np 100
Executing: singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/

In [5]:
# Print (do not run) the PRIAM command line for a single input FASTA file.
# NOTE(review): `input_fasta` is not assigned in any cell shown in this notebook, so this
# cell raises NameError on a fresh kernel; the stored output suggests it used to point at
# .../task240524/ds_test.fasta — define it explicitly before use.
priam_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/priam/'
print(f'''priam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i {input_fasta} -o {priam_res_file} --blast_path /opt/blast-2.2.26/bin -np 100
 ''')

priam cmd:
 singularity exec /hpcfs/fpublic/container/singularity/app/priam/priam.sif /opt/jdk1.7.0_80/bin/java -Xmx128G -jar /opt/priam/PRIAM_search.jar -p /opt/priam/PRIAM_JAN18  -i /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta -o /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/priam/ --blast_path /opt/blast-2.2.26/bin -np 100
 


In [6]:
# Earlier local invocation of PRIAM, kept for reference only:
# %conda activate DMLF
#! rm -rf /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18/PROFILES/LIBRARY
#! java -Xmx128G -jar ./PRIAM_search.jar -p /home/shizhenkun/codebase/DMLF/baselines/priam/PRIAM_JAN18 -i /home/shizhenkun/codebase/preaction/data/datasets/ds_test.fasta -o /home/shizhenkun/codebase/preaction/results/baselines/priam/alfp0921 --blast_path /home/shizhenkun/downloads/blast-2.2.26/bin -np 100

# Load the PRIAM annotation file and align the id column name with the test set.
# NOTE(review): `ds_test` is only ever assigned inside functions in the cells shown here,
# so this cell depends on kernel state from elsewhere and fails on a fresh kernel.
# NOTE(review): this reads one fixed PRIAM run directory, not the per-fold outputs in
# vali_res_priam — confirm which results are intended.
priam = btools.load_praim_res(resfile=f'{cfg.RESULTS_DIR}baselines/priam/PRIAM_20240708030102/ANNOTATION/sequenceECs.txt').rename(columns={'id':'uniprot_id'})
# Attach ground truth; proteins without a PRIAM row get 'NO-PREDICTION'.
test_priam = merge_test_set(testset=ds_test, needmergeset=priam)
# Map every predicted EC string to reactions via the module-level ec_reaction_map.
test_priam['reaction_priam']=test_priam.ec_priam.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
# Empty-string cells are labelled 'EC-WITHOUT-REACTION'.
test_priam = test_priam.replace('', 'EC-WITHOUT-REACTION')

test_priam.to_csv(cfg.FILE_RESULTS_PRIAM, sep='\t', index=False)

# NOTE(review): the stored output shows an extra 'isRight_reaction_catfam' column that
# this code does not create — the saved output looks stale.
test_priam.head(3)

Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_priam,reaction_priam,isRight_reaction_catfam
0,A9JLI2,-,-,NO-PREDICTION,NO-PREDICTION,False
1,A9JLI3,-,-,1.14.11.51;2.3.2.27,RHEA:49524,False
2,A9JLI5,-,-,6.5.1.3,EC-WITHOUT-REACTION,False


### 3.7 ECPred

In [28]:
# Print (do not run) the ECPred command line for a single input FASTA file.
# NOTE(review): `input_fasta` is not assigned in any cell shown in this notebook, so this
# cell raises NameError on a fresh kernel — define it explicitly before use.
ECPred_res_file = '/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt'
singularity_ecpred = '/hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif'
print(f'''ECpred cmd: 
singularity exec {singularity_ecpred} java  -Xmx128G -jar /ECPred/ECPred.jar spmap  {input_fasta} /ECPred/ /tmp {ECPred_res_file}
      ''')

ECpred cmd: 
singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta /ECPred/ /tmp /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt
      


In [None]:
# Run ECPred inside its container through the IPython shell escape.
# NOTE(review): the temp directory here (.../preaction/temp) differs from the '/tmp'
# shown in the command printed by the previous cell — confirm which one is intended.
!singularity exec /hpcfs/fpublic/container/singularity/app/ecpred/ecpred.sif  java  -Xmx128G -jar /ECPred/ECPred.jar spmap  /hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/ds_test.fasta   /ECPred/  /hpcfs/fhome/shizhenkun/codebase/preaction/temp  /hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt  

In [18]:
# Earlier local invocation of ECPred, kept for reference only:
# ! java -Xmx128G -jar /home/shizhenkun/codebase/DMLF/baselines/ECPred/ECPred.jar spmap /home/shizhenkun/codebase/preaction/data/datasets/ds_test.fasta /home/shizhenkun/codebase/DMLF/baselines/ECPred/ /home/shizhenkun/codebase/preaction/temp /home/shizhenkun/codebase/preaction/results/baselines/ecpred/alfp0921.txt
# Load the ECPred result file.
ecpred = btools.load_ecpred_res(resfile=f'/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/baselines/ecpred/test.txt')
# Collapse multiple rows per protein into one ';'-separated EC string, normalise ECPred's
# 'no Prediction' label to 'NO-PREDICTION', and align the id column name with the test set.
ecpred = ecpred.groupby('id').agg({ 'id':'first',  'ec_ecpred': ';'.join}).reset_index(drop=True).replace('no Prediction', 'NO-PREDICTION').rename(columns={'id':'uniprot_id'})

# NOTE(review): `ds_test` is only ever assigned inside functions in the cells shown here,
# so this cell depends on kernel state from elsewhere and fails on a fresh kernel.
test_ecpred = merge_test_set(testset=ds_test, needmergeset=ecpred)
# Map every predicted EC string to reactions via the module-level ec_reaction_map.
test_ecpred['reaction_ecpred']=test_ecpred.ec_ecpred.parallel_apply(lambda x: btools.retrival_reaction_from_ec(ec_pred=x, ec_reaction_map=ec_reaction_map))
# Empty-string cells are labelled 'EC-WITHOUT-REACTION'.
test_ecpred = test_ecpred.replace('', 'EC-WITHOUT-REACTION')


test_ecpred.to_csv(cfg.FILE_RESULTS_ECPRED, sep='\t', index=False)
test_ecpred.head(3)


Unnamed: 0,uniprot_id,reaction_groundtruth,ec_groundtruth,ec_ecpred,reaction_ecpred,isRight_reaction_ecpred
0,A9JLI2,-,-,-,-,True
1,A9JLI3,-,-,-,-,True
2,A9JLI5,-,-,-,-,True
