# Ecoli Case (K12, 4587) for RXNRecer

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-09-09  


## 1. Import packages

In [18]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
sys.path.insert(1,'../methods/active-learning/')
from config import conf as cfg
from tools import uniprottool as uptool
from tools import  bioFunctionLib as bfl
import production.production as production
from modules import commonfunction as cmfunc
from IPython.display import display_markdown
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
from tkinter import _flatten
import json
import pandas as pd


FIRST_TIME_RUN = False # For the initial run, please set this flag to True. This will allow the program to download data from UniProt and RHEA, which may take longer depending on your internet speed.

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Obtain ecoli case data from public web sites

In [9]:
if FIRST_TIME_RUN:
    api_url =f'https://rest.uniprot.org/uniprotkb/search?compressed=false&format=tsv&query=%28ecoli%29%20and%20%28model_organism:83333%29&fields=accession,reviewed,protein_name,gene_names,organism_name,length,ec,rhea,sequence&size=500'
    case_ecoli = uptool.get_batch_data_from_uniprot_rest_api(url=api_url)    
    case_ecoli = pd.DataFrame(case_ecoli, columns=['uniprot_id','reviewed','protein_name','gene_names','organism_name','length','ec','reaction_id','seq'])
    case_ecoli = case_ecoli.replace('','-')
    
    case_ecoli.reaction_id = case_ecoli.reaction_id.apply(lambda x: (cfg.SPLITER).join(x.split(' ')))
    case_ecoli.ec = case_ecoli.ec.apply(lambda x: (cfg.SPLITER).join(x.split(' ')))
    
    case_ecoli.to_feather(cfg.FILE_DS_CASE_ECOLI)    #保存文件
else:
    case_ecoli = pd.read_feather(cfg.FILE_DS_CASE_ECOLI)
    print(f'Records in uniprot-rhea relation:\t{len(case_ecoli)}')

Records in uniprot-rhea relation:	4481


## 3. Predcit 

In [11]:
# 从 JSON 文件加载反应编码字典
with open(cfg.FILE_DS_DICT_RXN2ID, "r") as json_file:
    dict_rxn2id = json.load(json_file)
    print(f'加载反应编码字典完成，共有 {len(dict_rxn2id)} 个反应。')  # 打印加载的数据

加载反应编码字典完成，共有 10479 个反应。


In [16]:
bfl.table2fasta(table=case_ecoli[['uniprot_id', 'seq']], file_out=f'{cfg.TEMP_DIR}ecoli.fasta')

In [19]:
res =  production.step_by_step_prediction(input_fasta=f'{cfg.TEMP_DIR}ecoli.fasta', 
                                       dict_rxn2id=cfg.FILE_DS_DICT_RXN2ID, 
                                       rxn_info_base=cfg.RXN_JSON_DIR, 
                                       output_file=f'{cfg.TEMP_DIR}case_ecoli_rxnrecer.tsv',
                                       format = 'tsv'
                                       )

Step 1: Load reaction encoding dictionary from JSON file
Finished loading rxn2id dictionary from JSON file. Total 10479 reactions.
Step 2: Load input reaction information
Step 3: Loading predictive model
Step 4: Predicting ...


Predicting reactions: 100%|██████████| 2241/2241 [02:35<00:00, 14.40it/s]

Step 5: Saving results to /hpcfs/fhome/shizhenkun/codebase/RXNRECer/temp/case_ecoli_rxnrecer.tsv





In [20]:
res

Unnamed: 0,input_id,RXNRECer
0,Q46901,-
1,P77304,-
2,P36837,-
3,P75990,-
4,P39276,-
...,...,...
4476,K0JFS6,RHEA:46164;RHEA:46160
4477,A0A3G0QC22,-
4478,C3UPD2,-
4479,A0A0S1EZS3,-


In [8]:
print('Start active learning prediction')
print('loading model')
model, mcfg = MPRED.load_model() 
mcfg.batch_size=20
print(f'Predction parameters: {mcfg}')
print('Predicting reactions')

res = Mactive.predict_sequences(model=model, 
                        sequences=case_ecoli.seq, 
                        model_weight_path=mcfg.model_weight_path, 
                        dict_path=mcfg.dict_path, 
                        batch_size=2,
                        device=mcfg.device)


print('Add Label')
case_ecoli['RXNRECer'] = res

case_ecoli['lb_rxn_groundtruth'] = case_ecoli.reaction_id.parallel_apply(lambda x: cmfunc.make_label(reaction_id=str(x), rxn_label_dict=dict_rxn2id)) #make label for each blast prediction
case_ecoli['lb_rxn_RXNRECer'] = case_ecoli.RXNRECer.parallel_apply(lambda x: cmfunc.make_label(reaction_id=str(x), rxn_label_dict=dict_rxn2id)) #make label for each blast prediction
metrics_rxn_enzyme = cmfunc.rxn_eva_metric(eva_df=case_ecoli, eva_name='[RXN ECOLI]', methods=['RXNRECer'])
metrics_rxn_enzyme

Start active learning prediction
loading model
Predction parameters: namespace(batch_size=20, esm_out_dim=1280, gru_h_dim=512, att_dim=32, dropout_rate=0.2, freeze_esm_layers=32, output_dimensions=10479, device=device(type='cuda'), model_weight_path='/hpcfs/fhome/shizhenkun/codebase/preaction/methods/active-learning/185846best_mode.pth', dict_path='/hpcfs/fhome/shizhenkun/codebase/preaction/data/datasets/task240524/dict_id2rxn.json')
Predicting reactions


Predicting reactions: 100%|██████████| 2241/2241 [02:31<00:00, 14.75it/s]

Add Label





Evaluating: Reaction Predcition Results [RXN ECOLI]


Unnamed: 0,baselineName,mAccuracy,mPrecision,mRecall,mF1
0,RXNRECer,0.927695,0.981561,0.917729,0.928603


## 4. Analysis of E. coli  prediction

In [14]:
ds_train =pd.read_feather(cfg.FILE_DS_TRAIN)
uniprot_id_in_train =  case_ecoli[case_ecoli.uniprot_id.isin(ds_train.uniprot_id)].uniprot_id.tolist()

r_in_train = case_ecoli[case_ecoli.uniprot_id.isin(uniprot_id_in_train)]
mr_in_train = cmfunc.rxn_eva_metric(eva_df=r_in_train, eva_name='[RXN ECOLI]', methods=['RXNRECer'])

r_notin_train = case_ecoli[~case_ecoli.uniprot_id.isin(uniprot_id_in_train)]
mr_notin_train = cmfunc.rxn_eva_metric(eva_df=r_notin_train, eva_name='[RXN ECOLI]', methods=['RXNRECer'])


print(f'Ecoli Records:{len(case_ecoli)}')
print(f'Overall Performance：\n{metrics_rxn_enzyme}\n')


print(f'Ecoli Records appeared in training set:{len(uniprot_id_in_train)}')
print(f'Performance：\n{mr_in_train}\n')


print(f'Ecoli Records not appeared in training set:{len(r_notin_train)}')
print(f'Performance：\n{mr_notin_train}\n')

Evaluating: Reaction Predcition Results [RXN ECOLI]
Evaluating: Reaction Predcition Results [RXN ECOLI]
Ecoli Records:4481
Overall Performance：
  baselineName  mAccuracy  mPrecision   mRecall       mF1
0     RXNRECer   0.927695    0.981561  0.917729  0.928603

Ecoli Records appeared in training set:3966
Performance：
  baselineName  mAccuracy  mPrecision  mRecall       mF1
0     RXNRECer   0.962683    0.990722  0.94547  0.946426

Ecoli Records not appeared in training set:515
Performance：
  baselineName  mAccuracy  mPrecision   mRecall       mF1
0     RXNRECer   0.658252         1.0  0.669231  0.800751



In [6]:
case_ecoli[case_ecoli.lb_rxn_groundtruth != case_ecoli.lb_rxn_RXNRECer]

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq,RXNRECer,lb_rxn_groundtruth,lb_rxn_RXNRECer
28,P31658,reviewed,Protein/nucleic acid deglycase 1 (EC 3.1.2.-) ...,hchA yedU yzzC b1967 JW1950,Escherichia coli (strain K12),283,3.1.2.-;;3.5.1.-;;3.5.1.124;;4.2.1.130,RHEA:27754;RHEA:49548;RHEA:49552;RHEA:49556;RH...,MTVQTSKNPQVDIAEDNAFFPSEYSLSQYTSPVSDLDGVDYPKPYR...,RHEA:57304;RHEA:57300;RHEA:57248;RHEA:57260;RH...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
41,P37339,reviewed,L-2-hydroxyglutarate dehydrogenase (L2HG dehyd...,lhgD lhgO ygaF b2660 JW2635,Escherichia coli (strain K12),422,1.1.5.13,RHEA:58664,MYDFVIIGGGIIGMSTAMQLIDVYPDARIALLEKESAPACHQTGHN...,RHEA:21252,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
45,P37595,reviewed,Isoaspartyl peptidase (EC 3.4.19.5) (Beta-aspa...,iaaA spt ybiK b0828 JW0812,Escherichia coli (strain K12),321,3.4.19.5,-,MGKAVIAIHGGAGAISRAQMSLQQELRYIEALSAIVETGQKMLEAG...,RHEA:21016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67,P37666,reviewed,Glyoxylate/hydroxypyruvate reductase B (EC 1.1...,ghrB tkrA yiaE b3553 JW5656,Escherichia coli (strain K12),324,1.1.1.215;;1.1.1.79;;1.1.1.81,RHEA:10992;RHEA:17905;RHEA:18657;RHEA:16653,MKPSVILYKALPDDLLQRLQEHFTVHQVANLSPQTVEQNAAIFAEA...,RHEA:17905;RHEA:10992;RHEA:18657,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
76,Q46857,reviewed,"2,5-diketo-D-gluconic acid reductase A (2,5-DK...",dkgA yqhE b3012 JW5499,Escherichia coli (strain K12),275,1.1.1.274,RHEA:23828,MANPTVIKLQDGNVMPQLGLGVWQASNEEVITAIQKALEVGYRSID...,RHEA:35111,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4146,P76418,reviewed,Uncharacterized protein YegU (EC 3.2.2.-),yegU b2099 JW2086,Escherichia coli (strain K12),334,3.2.2.-,-,MKTERILGALYGQALGDAMGMPSELWPRSRVKAHFGWIDRFLPGPK...,RHEA:14493,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4312,P52048,reviewed,Uncharacterized protein YggP,yggP b4465 JW5477,Escherichia coli (strain K12),425,-,-,MKTKVAAIYGKRDVRLRVFELPEITDNELLVSVISDSVCLSTWKAA...,RHEA:19661,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4420,P71311,reviewed,Uncharacterized deacetylase YaiS (EC 3.-.-.-),yaiS b0364 JW0356,Escherichia coli (strain K12),185,3.-.-.-,-,MDKVLDSALLSSANKRKGILAIGAHPDDIELGCGASLARLAQKGIY...,RHEA:33411,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4421,P0AAV8,reviewed,Putative isomerase YbhH (EC 5.-.-.-),ybhH b0769 JW0752,Escherichia coli (strain K12),350,5.-.-.-,-,MKKIPCVMMRGGTSRGAFLLAEHLPEDQTQRDKILMAIMGSGNDLE...,RHEA:28931,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [20]:
new_findeing = case_ecoli[(case_ecoli.reaction_id=='-')&( case_ecoli.RXNRECer!='-')].reset_index(drop=True)

In [22]:
ds_rxn = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)

In [25]:
ds_rxn.to_feather(cfg.FILE_DS_RHEA_REACTIONS)

In [None]:
def get_rxn_info()

In [21]:
new_findeing

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq,RXNRECer,lb_rxn_groundtruth,lb_rxn_RXNRECer
0,P37595,reviewed,Isoaspartyl peptidase (EC 3.4.19.5) (Beta-aspa...,iaaA spt ybiK b0828 JW0812,Escherichia coli (strain K12),321,3.4.19.5,-,MGKAVIAIHGGAGAISRAQMSLQQELRYIEALSAIVETGQKMLEAG...,RHEA:21016,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P17117,reviewed,Oxygen-insensitive NADPH nitroreductase (EC 1....,nfsA mda18 mdaA ybjB b0851 JW0835,Escherichia coli (strain K12),240,1.-.-.-,-,MTPTIELICGHRSIRHFTDEPISEAQREAIINSARATSSSSFLQCS...,RHEA:21624,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,P23872,reviewed,Acetyl esterase (EC 3.1.1.-) (EcE),aes ybaC b0476 JW0465,Escherichia coli (strain K12),319,3.1.1.-,-,MKPENKLPVLDLISAEMKTVVNTLQPDLPPWPATGTIAEQRQYYTL...,RHEA:21164,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,P0A8V0,reviewed,Ribonuclease BN (RNase BN) (EC 3.1.-.-) (Ribon...,rbn elaC rnz b2268 JW2263,Escherichia coli (strain K12),305,3.1.-.-,-,MELIFLGTSAGVPTRTRNVTAILLNLQHPTQSGLWLFDCGEGTQHQ...,RHEA:10748,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,P75792,reviewed,Sugar phosphatase YbiV (EC 3.1.3.23),ybiV supH b0822 JW0806,Escherichia coli (strain K12),271,3.1.3.23,-,MSVKVIVTDMDGTFLNDAKTYNQPRFMAQYQELKKRGIKFVVASGN...,RHEA:25197,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
182,P76418,reviewed,Uncharacterized protein YegU (EC 3.2.2.-),yegU b2099 JW2086,Escherichia coli (strain K12),334,3.2.2.-,-,MKTERILGALYGQALGDAMGMPSELWPRSRVKAHFGWIDRFLPGPK...,RHEA:14493,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
183,P52048,reviewed,Uncharacterized protein YggP,yggP b4465 JW5477,Escherichia coli (strain K12),425,-,-,MKTKVAAIYGKRDVRLRVFELPEITDNELLVSVISDSVCLSTWKAA...,RHEA:19661,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
184,P71311,reviewed,Uncharacterized deacetylase YaiS (EC 3.-.-.-),yaiS b0364 JW0356,Escherichia coli (strain K12),185,3.-.-.-,-,MDKVLDSALLSSANKRKGILAIGAHPDDIELGCGASLARLAQKGIY...,RHEA:33411,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
185,P0AAV8,reviewed,Putative isomerase YbhH (EC 5.-.-.-),ybhH b0769 JW0752,Escherichia coli (strain K12),350,5.-.-.-,-,MKKIPCVMMRGGTSRGAFLLAEHLPEDQTQRDKILMAIMGSGNDLE...,RHEA:28931,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
case_ecoli

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq,RXNRECer,lb_rxn_groundtruth,lb_rxn_RXNRECer
0,Q46901,reviewed,CRISPR system Cascade subunit CasA (CRISPR typ...,casA cse1 ygcL b2760 JW2730,Escherichia coli (strain K12),502,-,-,MNLLIDNWIPVRPRNGGKVQIINLQSLYCSRDQWRLSLPRDDMELA...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P77304,reviewed,Dipeptide and tripeptide permease A,dtpA tppB ydgR b1634 JW1626,Escherichia coli (strain K12),500,-,-,MSTANQKPTESVSLNAFKQPKAFYLIFSIELWERFGYYGLQGIMAV...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,P36837,reviewed,Dipeptide and tripeptide permease B,dtpB yhiP b3496 JW3463,Escherichia coli (strain K12),489,-,-,MNTTTPMGMLQQPRPFFMIFFVELWERFGYYGVQGVLAVFFVKQLG...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,P75990,reviewed,Blue light- and temperature-regulated antirepr...,bluF blrp ycgF b1163 JW1150,Escherichia coli (strain K12),403,-,-,MLTTLIYRSHIRDDEPVKKIEEMVSIANRRNMQSDVTGILLFNGSH...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,P39276,reviewed,Dipeptide and tripeptide permease C (Dipeptide...,dtpC yjdL b4130 JW4091,Escherichia coli (strain K12),485,-,-,MKTPSQPRAIYYIVAIQIWEYFSFYGMRALLILYLTHQLGFDDNHA...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4476,K0JFS6,unreviewed,Flavoprotein WrbA,wrbA,Escherichia coli (strain K12),101,-,-,MINHILRHVETMARAVAEGASKVDGAEVVVKRVPETMPPQLFEKAG...,RHEA:46164;RHEA:46160,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4477,A0A3G0QC22,unreviewed,Transposase IS4-like domain-containing protein,yncM,Escherichia coli (strain K12),66,-,-,MSFRQKKEDKSAEGVSIRYYISSKDMDAKEFAHAIRAHWLIEHSLH...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4478,C3UPD2,unreviewed,TmRNA tag peptide,-,Escherichia coli (strain K12),10,-,-,ANDENYALAA,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4479,A0A0S1EZS3,unreviewed,Arsenical pump membrane protein,yfjV,Escherichia coli (strain K12),66,-,-,MAAGFIADTASLPLIVSNLVNIVSADFFGLSFAQYASVMISVDAAA...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
r_in_train[r_in_train.lb_rxn_groundtruth != r_in_train.lb_rxn_RXNRECer]

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq,RXNRECer,lb_rxn_groundtruth,lb_rxn_RXNRECer
28,P31658,reviewed,Protein/nucleic acid deglycase 1 (EC 3.1.2.-) ...,hchA yedU yzzC b1967 JW1950,Escherichia coli (strain K12),283,3.1.2.-;;3.5.1.-;;3.5.1.124;;4.2.1.130,RHEA:27754;RHEA:49548;RHEA:49552;RHEA:49556;RH...,MTVQTSKNPQVDIAEDNAFFPSEYSLSQYTSPVSDLDGVDYPKPYR...,RHEA:57304;RHEA:57300;RHEA:57248;RHEA:57260;RH...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
41,P37339,reviewed,L-2-hydroxyglutarate dehydrogenase (L2HG dehyd...,lhgD lhgO ygaF b2660 JW2635,Escherichia coli (strain K12),422,1.1.5.13,RHEA:58664,MYDFVIIGGGIIGMSTAMQLIDVYPDARIALLEKESAPACHQTGHN...,RHEA:21252,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67,P37666,reviewed,Glyoxylate/hydroxypyruvate reductase B (EC 1.1...,ghrB tkrA yiaE b3553 JW5656,Escherichia coli (strain K12),324,1.1.1.215;;1.1.1.79;;1.1.1.81,RHEA:10992;RHEA:17905;RHEA:18657;RHEA:16653,MKPSVILYKALPDDLLQRLQEHFTVHQVANLSPQTVEQNAAIFAEA...,RHEA:17905;RHEA:10992;RHEA:18657,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
76,Q46857,reviewed,"2,5-diketo-D-gluconic acid reductase A (2,5-DK...",dkgA yqhE b3012 JW5499,Escherichia coli (strain K12),275,1.1.1.274,RHEA:23828,MANPTVIKLQDGNVMPQLGLGVWQASNEEVITAIQKALEVGYRSID...,RHEA:35111,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
88,P39396,reviewed,Pyruvate/proton symporter BtsT (Brenztraubensa...,btsT yjiY b4354 JW5791,Escherichia coli (strain K12),716,-,RHEA:64720,MDTKKIFKHIPWVILGIIGAFCLAVVALRRGEHISALWIVVASVSV...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3505,P77624,reviewed,Carbamate kinase-like protein YahI,yahI b0323 JW0315,Escherichia coli (strain K12),316,-,-,MKELVVVAIGGNSIIKDNASQSIEHQAEAVKAVADTVLEMLASDYD...,RHEA:10152,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3778,P38104,reviewed,Starvation-sensing protein RspA,rspA b1581 JW1573,Escherichia coli (strain K12),404,-,-,MKIVKAEVFVTCPGRNFVTLKITTEDGITGLGDATLNGRELSVASY...,RHEA:20097,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3791,P31061,reviewed,Prophage DNA-packing protein NohA (Prophage Qi...,nohA nohQ b1548 JW1541,Escherichia coli (strain K12),189,-,-,MEVNKKQLADIFGASIRTIQNWQEQGMPVLRGGGKGNEVLYDSAAV...,RHEA:13065,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3923,P31062,reviewed,DNA-packaging protein NU1 homolog,nohD nohB b0560 JW0549,Escherichia coli (strain K12),181,-,-,MEVNKKQLADIFGASIRTIQNWQEQGMPVLRGGGKGNEVLYDSAAV...,RHEA:13065,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
r_in_train[r_in_train.RXNRECer=='-']

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq,RXNRECer,lb_rxn_groundtruth,lb_rxn_RXNRECer
0,Q46901,reviewed,CRISPR system Cascade subunit CasA (CRISPR typ...,casA cse1 ygcL b2760 JW2730,Escherichia coli (strain K12),502,-,-,MNLLIDNWIPVRPRNGGKVQIINLQSLYCSRDQWRLSLPRDDMELA...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,P77304,reviewed,Dipeptide and tripeptide permease A,dtpA tppB ydgR b1634 JW1626,Escherichia coli (strain K12),500,-,-,MSTANQKPTESVSLNAFKQPKAFYLIFSIELWERFGYYGLQGIMAV...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,P36837,reviewed,Dipeptide and tripeptide permease B,dtpB yhiP b3496 JW3463,Escherichia coli (strain K12),489,-,-,MNTTTPMGMLQQPRPFFMIFFVELWERFGYYGVQGVLAVFFVKQLG...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,P75990,reviewed,Blue light- and temperature-regulated antirepr...,bluF blrp ycgF b1163 JW1150,Escherichia coli (strain K12),403,-,-,MLTTLIYRSHIRDDEPVKKIEEMVSIANRRNMQSDVTGILLFNGSH...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,P39276,reviewed,Dipeptide and tripeptide permease C (Dipeptide...,dtpC yjdL b4130 JW4091,Escherichia coli (strain K12),485,-,-,MKTPSQPRAIYYIVAIQIWEYFSFYGMRALLILYLTHQLGFDDNHA...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4466,P18148,reviewed,Protein PsiA,psiA ECOK12F064,Escherichia coli (strain K12),239,-,-,MSVRSQALVPLSTEQQAAWRAVAETEKRRHQGNTLAEYPYAGAFFR...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4467,P19768,reviewed,Insertion element IS150 protein InsJ (ORFA),insJ b3557 JW3527,Escherichia coli (strain K12),173,-,-,MSKPKYPFEKRLEVVNHYFTTDDGYRIISARFGVPRTQVRTWVALY...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4468,Q47719,reviewed,Putative protein YjhV,yjhV b4286 JW4246,Escherichia coli (strain K12),137,-,-,MVGYHQTNQKTDTGKTLTRRPVLVDHNRLPEGSRGRLAVAVAGDHP...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4469,Q9JMR3,reviewed,UPF0401 protein YubL,yubL yfjA ECOK12F061,Escherichia coli (strain K12),79,-,-,MRMSEYFRILQGLPDGPFTRKHAEAVAAQYRNVFIEDDHGEQFRLV...,-,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
case_ecoli

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq
0,Q46901,reviewed,CRISPR system Cascade subunit CasA (CRISPR typ...,casA cse1 ygcL b2760 JW2730,Escherichia coli (strain K12),502,-,-,MNLLIDNWIPVRPRNGGKVQIINLQSLYCSRDQWRLSLPRDDMELA...
1,P77304,reviewed,Dipeptide and tripeptide permease A,dtpA tppB ydgR b1634 JW1626,Escherichia coli (strain K12),500,-,-,MSTANQKPTESVSLNAFKQPKAFYLIFSIELWERFGYYGLQGIMAV...
2,P36837,reviewed,Dipeptide and tripeptide permease B,dtpB yhiP b3496 JW3463,Escherichia coli (strain K12),489,-,-,MNTTTPMGMLQQPRPFFMIFFVELWERFGYYGVQGVLAVFFVKQLG...
3,P75990,reviewed,Blue light- and temperature-regulated antirepr...,bluF blrp ycgF b1163 JW1150,Escherichia coli (strain K12),403,-,-,MLTTLIYRSHIRDDEPVKKIEEMVSIANRRNMQSDVTGILLFNGSH...
4,P39276,reviewed,Dipeptide and tripeptide permease C (Dipeptide...,dtpC yjdL b4130 JW4091,Escherichia coli (strain K12),485,-,-,MKTPSQPRAIYYIVAIQIWEYFSFYGMRALLILYLTHQLGFDDNHA...
...,...,...,...,...,...,...,...,...,...
4476,K0JFS6,unreviewed,Flavoprotein WrbA,wrbA,Escherichia coli (strain K12),101,-,-,MINHILRHVETMARAVAEGASKVDGAEVVVKRVPETMPPQLFEKAG...
4477,A0A3G0QC22,unreviewed,Transposase IS4-like domain-containing protein,yncM,Escherichia coli (strain K12),66,-,-,MSFRQKKEDKSAEGVSIRYYISSKDMDAKEFAHAIRAHWLIEHSLH...
4478,C3UPD2,unreviewed,TmRNA tag peptide,-,Escherichia coli (strain K12),10,-,-,ANDENYALAA
4479,A0A0S1EZS3,unreviewed,Arsenical pump membrane protein,yfjV,Escherichia coli (strain K12),66,-,-,MAAGFIADTASLPLIVSNLVNIVSADFFGLSFAQYASVMISVDAAA...


In [13]:
case_ecoli[case_ecoli.reaction_id!='-']

Unnamed: 0,uniprot_id,reviewed,protein_name,gene_names,organism_name,length,ec,reaction_id,seq,RXNRECer,lb_rxn_groundtruth,lb_rxn_RXNRECer
10,P03960,reviewed,Potassium-transporting ATPase ATP-binding subu...,kdpB b0697 JW0685,Escherichia coli (strain K12),682,7.2.2.6,RHEA:16777,MSRKQLALFEPTLVVQALKEAVKKLNPQAQWRNPVMFIVWIGSLLT...,RHEA:16777,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13,P0A9P0,reviewed,Dihydrolipoyl dehydrogenase (EC 1.8.1.4) (Dihy...,lpdA lpd b0116 JW0112,Escherichia coli (strain K12),474,1.8.1.4,RHEA:15045,MSTEIKTQVVVLGAGPAGYSAAFRCADLGLETVIVERYNTLGGVCL...,RHEA:15045,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16,P25539,reviewed,Riboflavin biosynthesis protein RibD [Includes...,ribD ribG ybaE b0414 JW0404,Escherichia coli (strain K12),367,1.1.1.193;;3.5.4.26,RHEA:21868;RHEA:17845,MQDEYYMARALKLAQRGRFTTHPNPNVGCVIVKDGEIVGEGYHQRA...,RHEA:17845;RHEA:21868,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17,P76251,reviewed,D-malate dehydrogenase [decarboxylating] (EC 1...,dmlA yeaU b1800 JW1789,Escherichia coli (strain K12),361,1.1.1.83,RHEA:18365,MMKTMRIAAIPGDGIGKEVLPEGIRVLQAAAERWGFALSFEQMEWA...,RHEA:18365,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
18,P08200,reviewed,Isocitrate dehydrogenase [NADP] (IDH) (EC 1.1....,icd icdA icdE b1136 JW1122,Escherichia coli (strain K12),416,1.1.1.42,RHEA:19629,MESKVVVPAQGKKITLQNGKLNVPENPIIPYIEGDGIGVDVTPAML...,RHEA:19629,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
3218,P77495,reviewed,Propionate--CoA ligase (EC 6.2.1.17) (Propiony...,prpE yahU b0335 JW0326,Escherichia coli (strain K12),628,6.2.1.17,RHEA:20373,MSFSEFYQRSINEPEQFWAEQARRIDWQTPFTQTLDHSNPPFARWF...,RHEA:20373,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3223,P0AAE5,reviewed,Putative arginine/ornithine antiporter,ydgI b1605 JW1597,Escherichia coli (strain K12),460,-,RHEA:34991,MEKKLGLSALTALVLSSMLGAGVFSLPQNMAAVASPAALLIGWGIT...,RHEA:34991,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3239,P0AEX5,reviewed,Probable phosphoribulokinase (PRK) (PRKase) (E...,prkB yhfF b3355 JW3318,Escherichia coli (strain K12),289,2.7.1.19,RHEA:19365,MSAKHPVIAVTGSSGAGTTTTSLAFRKIFAQLNLHAAEVEGDSFHR...,RHEA:19365,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3630,P0A9U8,reviewed,Probable acyl-CoA dehydrogenase YdiO (EC 1.3.-.-),ydiO b1695 JW5275,Escherichia coli (strain K12),383,1.3.-.-,RHEA:48608,MDFSLTEEQELLLASIRELITTNFPEEYFRTCDQNGTYPREFMRAL...,RHEA:48608,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
