# 镰刀菌Case
> 2025-03-20   
> zhenkun.shi@tib.cas.cn

In [20]:

# Notebook setup: extend sys.path so project-local packages import, then load
# the libraries used below.
# NOTE: '__file__' is a quoted string literal here, so realpath() resolves it
# against the current working directory and dirname() yields that directory.
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
# Two directory levels up; presumably the repository root that holds
# config/, tools/ and rxnrecer — TODO confirm.
sys.path.insert(1,'../../')
from config import conf as cfg
import pandas as pd
import json
import plotly.graph_objects as go
import itertools
from tools import btools
from tqdm import tqdm
import rxnrecer as production
import subprocess
import tools.bioFunctionLib as bfl
from IPython.display import HTML
from pandarallel import pandarallel # import pandarallel
# Flag is assigned here but not read by any cell visible in this notebook.
FIRST_TIME_RUN = False
# Initialise pandarallel with its progress bar disabled.
pandarallel.initialize(progress_bar=False)
# Reload imported modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. 读取未注释的蛋白组数据
### 1.1 NCBI 蛋白数据

In [2]:
# Read the NCBI proteome and normalise every sequence (string, trimmed, upper case)
# so that identical proteins compare equal.
data_ncbi = bfl.fasta_to_dataframe(fasta_file='./data/ncbi_protein.fasta')
data_ncbi['seq'] = data_ncbi['seq'].apply(lambda raw_seq: str(raw_seq).strip().upper())
print(f'load {len(data_ncbi)} proteins from ncbi')

# For each sequence that occurs more than once, list the ids that share it
# and how many times it appears.
repeated_rows = data_ncbi[data_ncbi.duplicated('seq', keep=False)]
duplicated_seqs = (
    repeated_rows
    .groupby('seq')
    .agg({'uniprot_id': list, 'seq': 'count'})
    .rename(columns={'seq': 'repeat_count'})
    .reset_index()
)

print(f'find {len(duplicated_seqs)} duplicated sequences')
# Show the duplicate summary
duplicated_seqs


load 12844 proteins from ncbi
find 12 duplicated sequences


Unnamed: 0,seq,uniprot_id,repeat_count
0,GIIVICDGRVRGPEMEKTTAQYLKEDIFVEKIHSEKIRGAYRAWDG...,"[XP_025586490.2, XP_065463036.1]",2
1,MASRSLVPSCSAHRAIRQVLTSPRSSITAFRTTAAPALSPLQAFAR...,"[XP_065463042.1, XP_065464976.1]",2
2,MAVSPVLTHTAEAAQTQTLGRAISSPLKNVTSQTSSSNGTPMPPVT...,"[XP_025581948.2, XP_065463031.1]",2
3,MGSLSYNNSPDLGWVVQKFGGTSVGKFPDKIAKDIVRATLSQNRVI...,"[XP_025581946.1, XP_065463033.1]",2
4,MKLRARGYLFYLLVIFFSSILSLSNRLGVWLLITGNGWFQGSCQEN...,"[XP_065463038.1, XP_065464974.1]",2
5,MKSLTFLVLAALPSMVYADGFVARRRGAEFNIHVRHEDDASYQPEG...,"[XP_065463037.1, XP_065464973.1]",2
6,MNQQPAAQNQAHAQASNGAGQDTWDEQRLEEAMKRLKLLHIKVRRL...,"[XP_025586493.2, XP_065463040.1]",2
7,MPNTPPPALLPPPEGIFRTFDDLMASVQRVAKDQGYGIVKLRASNY...,"[XP_025586492.1, XP_065463039.1]",2
8,MTGRGKGGKGLGKGGAKRHRKILRDNIQGITKPAIRRLARRGGVKR...,"[XP_025586653.1, XP_025592365.1]",2
9,MVNMRVQYRRRNGYNTSSNRTRVIKTPGGDIRLLHIKKRGTVPKCG...,"[XP_065463041.1, XP_065464975.1]",2


In [3]:
# Keep only the first record of every distinct sequence and renumber the rows.
unique_sequences = data_ncbi.drop_duplicates(subset=['seq'])
data_ncbi = unique_sequences.reset_index(drop=True)
print(f"After drop duplicates get {len(data_ncbi)} sequences")
data_ncbi.head(3)

After drop duplicates get 12832 sequences


Unnamed: 0,uniprot_id,seq
0,XP_025580784.2,MSLLHEIISNLNSDPSRLNDFIAYLSKNHCLETLQFIQDASRYRAC...
1,XP_025580785.1,MEMRKSTSSIPWQNDRPWKQSFYGWSPETSRWQGIQSSQSSFDKSD...
2,XP_025580788.2,MVPEQQRSPRILACVLCHQRKKKCDRKSPCSFCTKAGIECIPSTPA...


### 1.2 TrEMBL data preparation

In [4]:
# Load the UniProtKB/TrEMBL export for the organism and shorten the column names
# used later in the notebook.
trembl_column_names = {
    'Entry': 'uniprot_id',
    'EC number': 'ec',
    'Rhea ID': 'rxn_id',
    'Length': 'len',
    'Sequence': 'seq',
}
data_trembl = pd.read_csv(
    './data/uniprotkb_taxonomy_id_56646_2025_03_20.tsv',
    sep='\t',
).rename(columns=trembl_column_names)
# Same normalisation as for the NCBI sequences so the two sets can be matched on `seq`.
data_trembl['seq'] = data_trembl['seq'].apply(lambda raw_seq: str(raw_seq).strip().upper())
print(f'Load {len(data_trembl)} of proteins from trembl ')
data_trembl.head(3)

Load 14016 of proteins from trembl 


Unnamed: 0,uniprot_id,Reviewed,Entry Name,Protein names,Gene Names,Organism,len,ec,rxn_id,GeneID,Gene Names (ordered locus),PubMed ID,seq
0,A0A2L2SSP6,unreviewed,A0A2L2SSP6_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,2423,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAQAQQIALERLEQVSRGLKSKVSDDVRKRSAVQLRELVVICHRDL...
1,A0A2L2ST09,unreviewed,A0A2L2ST09_9HYPO,Serine/threonine-protein kinase RIO1 (EC 2.7.1...,,Fusarium venenatum,546,2.7.11.1,RHEA:17989 RHEA:46608,,,,MDPAAPHQPPYTYTANQGYEQTEEIPRELQTQRDDGAALDNQDDDN...
2,A0A2L2T2H5,unreviewed,A0A2L2T2H5_9HYPO,Histone acetyltransferase type B catalytic sub...,,Fusarium venenatum,478,2.3.1.48,RHEA:45948,,,,MEDVTPWLSDANEAIQINLLSPSDSGLQHIATFNPRHTYSIFGDEE...


## 2. 开始注释

In [8]:
# All inputs and outputs of this case study live below one case directory.
case_dir = f'{cfg.CASE_DIR}fusarium_venenatum/'

# FASTA file handed to the external annotation tools.
file_input_fasta = f'{case_dir}data/ncbi_protein.fasta'

# One result location per annotation method.
file_output_rxnrecer = f'{case_dir}res/rxnrecer.feather'
file_output_ecrecer = f'{case_dir}res/ecrecer.tsv'
file_output_clean = f'{case_dir}res/clean.tsv'
file_output_msa = f'{case_dir}res/msa.tsv'
file_out_deepec = f'{case_dir}res/deepec'
file_out_catfam = f'{case_dir}res/catfam.tsv'

### 2.1 RXNRECer

In [None]:
# Run the RXNRECer ensemble prediction on the de-duplicated proteins and
# persist the result as a feather file.
rxnrecer_input = data_ncbi[['uniprot_id', 'seq']]
res_ensemble = production.step_by_step_prediction(input_data=rxnrecer_input, Ensemble=True)
res_ensemble.to_feather(file_output_rxnrecer)

### 2.2 CLEAN

In [None]:
# Run CLEAN inside its singularity container on the input FASTA.
cmd = (
    'singularity exec --nv /hpcfs/fpublic/container/singularity/app/clean/clean.sif '
    'python /app/inference.py '
    f'-i {file_input_fasta} -o {file_output_clean} -d ~/tmp/'
)
# check=True raises CalledProcessError when the tool exits with a non-zero status;
# stdout and stderr are captured as text.
subprocess.run(cmd, shell=True, check=True, text=True, capture_output=True)

### 2.3 ECRECer

In [None]:
# Run ECRECer inside its singularity container on the input FASTA.
cmd = (
    'singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer.sif '
    'python /ecrecer/production.py '
    f'-i {file_input_fasta} -o {file_output_ecrecer} '
    '-mode h -topk 20'
)
# check=True raises CalledProcessError when the tool exits with a non-zero status;
# stdout and stderr are captured as text.
subprocess.run(cmd, shell=True, check=True, text=True, capture_output=True)

'singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer.sif python /ecrecer/production.py -i /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/data/ncbi_protein.fasta -o /hpcfs/fhome/shizhenkun/codebase/RXNRECer/case/fusarium_venenatum/res/ecrecer.tsv -mode h -topk 20'

### 2.4 MSA

In [None]:
# Alignment baseline: take the best BLAST hit (k=1) of every query in the training
# set and transfer that hit's reaction / EC annotation.
ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)
blast_hits = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=data_ncbi, k=1)  # sequence alignment

# Attach the annotation of the matched training protein to each query id.
msa_column_names = {'id': 'uniprot_id', 'reaction_id': 'MSA_RXN', 'ec_number': 'MSA_EC'}
hit_annotations = blast_hits.merge(ds_train, left_on='sseqid', right_on='uniprot_id', how='left')
hit_annotations = hit_annotations[['id', 'reaction_id', 'ec_number']].rename(columns=msa_column_names)

# Re-attach to the full query list; queries without any hit get 'NO-PREDICTION'.
all_queries = data_ncbi[['uniprot_id']]
blast_res = (
    all_queries
    .merge(hit_annotations, on='uniprot_id', how='left')
    .fillna('NO-PREDICTION')
    .rename(columns={'uniprot_id': 'input_id'})
)
blast_res.to_csv(file_output_msa, sep='\t', index=False)
blast_res.head(3)

Unnamed: 0,input_id,MSA_RXN,MSA_EC
0,XP_025580784.2,NO-PREDICTION,NO-PREDICTION
1,XP_025580785.1,NO-PREDICTION,NO-PREDICTION
2,XP_025580788.2,-,-


### 2.5 DeepEC prediction

In [None]:
# Run DeepEC inside its singularity container; results are written into a directory.
cmd = (
    'singularity exec --nv  /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif '
    'python /opt/deepec/deepec.py '
    f'-i {file_input_fasta} -o {file_out_deepec}'
)
# check=True raises CalledProcessError when the tool exits with a non-zero status;
# stdout and stderr are captured as text.
subprocess.run(cmd, shell=True, check=True, text=True, capture_output=True)

### 2.6 CatFam

In [None]:
# Run the CatFam search script inside its singularity container against the
# CatFam4D99R database.
cmd = (
    'singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif '
    '/catfam/source/catsearch.pl '
    '-d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R '
    f'-i {file_input_fasta} -o {file_out_catfam}'
)
# check=True raises CalledProcessError when the tool exits with a non-zero status;
# stdout and stderr are captured as text.
subprocess.run(cmd, shell=True, check=True, text=True, capture_output=True)

## 3. 读取结果

In [9]:
# RXNRECer prediction for fusarium_venenatum
res_rxnrecer = pd.read_feather(file_output_rxnrecer)
dict_rhea2ec = btools.load_dict_rxn2ec()
res_rxnrecer['RXNRECer2EC'] = res_rxnrecer.RXNRECer.apply(lambda x: btools.transRXN2EC(rxns=x, dict_rxn2ec=dict_rhea2ec)) 
res_rxnrecer.head(5)

Unnamed: 0,input_id,RXNRECer,RXNRECer_with_prob,RXNRECer2EC
0,XP_025580784.2,-,"{'-': 0.999997, 'RHEA:10012': None, 'RHEA:1002...",-
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;-;RHEA:36295,"{'-': 0.963705, 'RHEA:10012': None, 'RHEA:1002...",REACTION-WITHOUT-EC;REACTION-WITHOUT-EC;REACTI...
2,XP_025580788.2,-,"{'-': 0.999998, 'RHEA:10012': None, 'RHEA:1002...",-
3,XP_025580789.2,-,"{'-': 0.9954900145530701, 'RHEA:10012': None, ...",-
4,XP_025580792.2,RHEA:48940;-;RHEA:11312;RHEA:14025,"{'-': 0.962675, 'RHEA:10012': None, 'RHEA:1002...",3.4.13.19;-;3.8.1.8;3.5.4.24


In [10]:
# Clean prediction for fusarium venenatum
res_clean = pd.read_csv(file_output_clean, sep='\t')[['Entry','clean_pred_ec_maxsep']].rename(columns={'Entry': 'input_id', 'clean_pred_ec_maxsep':'clean'})
res_clean.clean = res_clean.clean.apply(lambda x: x.split('/')[0].replace('EC:',''))
res_clean.head(3)

Unnamed: 0,input_id,clean
0,XP_025580784.2,2.7.7.n1
1,XP_025580785.1,3.1.4.12
2,XP_025580788.2,2.3.1.48


In [11]:
# ECRECer prediction for fusarium venenatum
res_ecrecer = pd.read_csv(file_output_ecrecer, sep='\t')[['input_id', 'dmlf_ec']].rename(columns={'dmlf_ec': 'ECRECer'})
res_ecrecer.head(3)

Unnamed: 0,input_id,ECRECer
0,XP_025586653.1,-
1,XP_025592365.1,-
2,XP_025586654.1,-


In [12]:
# MSA 
res_msa = pd.read_csv(file_output_msa, sep='\t')
res_msa.head(3)

Unnamed: 0,input_id,MSA_RXN,MSA_EC
0,XP_025580784.2,NO-PREDICTION,NO-PREDICTION
1,XP_025580785.1,NO-PREDICTION,NO-PREDICTION
2,XP_025580788.2,-,-


In [13]:
# DeepeEC
res_deepEC = btools.load_deepec_resluts(filepath=file_out_deepec+'/DeepEC_Result.txt').rename(columns={'id':'input_id', 'ec_deepec':'deepec'})
res_deepEC.head(3)

Unnamed: 0,input_id,deepec
0,XP_025580794.1,2.5.1.129
1,XP_025580807.1,3.1.1.3
2,XP_025580816.1,3.7.1.2


In [14]:
# CatFam
res_catfam = btools.load_catfam_res(file_out_catfam).rename(columns={'id': 'input_id', 'ec_catfam':'catfam'})
res_catfam.head(3)

Unnamed: 0,input_id,catfam
0,XP_025580784.2,-
1,XP_025580785.1,-
2,XP_025580788.2,-


In [15]:
# Combine the predictions of every tool into one table keyed by `input_id`.
# RXNRECer is the left-most table, so its rows define which proteins appear.
res = res_rxnrecer[['input_id', 'RXNRECer', 'RXNRECer2EC']]
for tool_result in (res_clean, res_ecrecer, res_msa, res_deepEC, res_catfam):
    res = res.merge(tool_result, on='input_id', how='left')

# Fill the placeholder only after ALL merges. Previously fillna ran before the
# CatFam merge, so proteins missing from the CatFam output kept NaN while every
# other tool got 'NO-PREDICTION'.
res = res.fillna('NO-PREDICTION')

# Restrict to the de-duplicated NCBI proteins loaded above.
res = res[res.input_id.isin(data_ncbi.uniprot_id)]
res.head(3)

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;-;RHEA:36295,REACTION-WITHOUT-EC;REACTION-WITHOUT-EC;REACTI...,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
2,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-


## 4. 处理结果

In [16]:
# Proteins for which every tool reports exactly the same EC string as RXNRECer.
# The original expression repeated the MSA_EC comparison (copy-paste); each
# column is now compared exactly once.
compared_columns = ['clean', 'ECRECer', 'MSA_EC', 'deepec', 'catfam']
all_agree = pd.Series(True, index=res.index)
for compared_column in compared_columns:
    all_agree &= res['RXNRECer2EC'] == res[compared_column]
res[all_agree]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam
132,XP_025580967.2,RHEA:20301,6.1.1.19,6.1.1.19,6.1.1.19,RHEA:20301,6.1.1.19,6.1.1.19,6.1.1.19
160,XP_025581006.1,RHEA:18561,1.3.1.70,1.3.1.70,1.3.1.70,RHEA:18561,1.3.1.70,1.3.1.70,1.3.1.70
189,XP_025581042.1,RHEA:21524,2.3.3.13,2.3.3.13,2.3.3.13,RHEA:21524,2.3.3.13,2.3.3.13,2.3.3.13
242,XP_025581123.1,RHEA:18585,5.3.1.1,5.3.1.1,5.3.1.1,RHEA:18585,5.3.1.1,5.3.1.1,5.3.1.1
283,XP_025581190.1,RHEA:23352,2.6.1.19,2.6.1.19,2.6.1.19,RHEA:23352,2.6.1.19,2.6.1.19,2.6.1.19
...,...,...,...,...,...,...,...,...,...
12302,XP_065465614.1,RHEA:10932,6.3.4.5,6.3.4.5,6.3.4.5,RHEA:10932,6.3.4.5,6.3.4.5,6.3.4.5
12472,XP_065465784.1,RHEA:13237,2.6.1.16,2.6.1.16,2.6.1.16,RHEA:13237,2.6.1.16,2.6.1.16,2.6.1.16
12485,XP_065465797.1,RHEA:22452,3.6.1.10,3.6.1.10,3.6.1.10,RHEA:22452,3.6.1.10,3.6.1.10,3.6.1.10
12570,XP_065465882.1,RHEA:58196,3.5.1.98,3.5.1.98,3.5.1.98,RHEA:58196,3.5.1.98,3.5.1.98,3.5.1.98


In [105]:
# Values that mean "this tool did not assign a function to the protein".
NON_PREDICTION_LABELS = ('-', 'NO-PREDICTION')


def _count_predicted(column):
    """Return how many rows of `res[column]` carry a real prediction.

    A row counts as predicted when its value is neither missing (NaN) nor one
    of NON_PREDICTION_LABELS. Previously only the MSA and DeepEC columns
    excluded 'NO-PREDICTION'; the other tools counted it as a prediction.
    """
    values = res[column]
    has_prediction = values.notna() & ~values.isin(NON_PREDICTION_LABELS)
    return int(has_prediction.sum())


rxnpred_num_rxnrecer = _count_predicted('RXNRECer')
rxnpred_num_clean = _count_predicted('clean')
rxnpred_num_ecrecer = _count_predicted('ECRECer')
rxnpred_num_msa_rxn = _count_predicted('MSA_RXN')
rxnpred_num_msa_ec = _count_predicted('MSA_EC')
rxnpred_num_catfam = _count_predicted('catfam')
rxnpred_num_deepec = _count_predicted('deepec')

# One row per method, in the order used for reporting.
rxnpred = pd.DataFrame(
    [
        ['RXNRECer', rxnpred_num_rxnrecer],
        ['ECRECer', rxnpred_num_ecrecer],
        ['CLEAN', rxnpred_num_clean],
        ['MSA_RXN', rxnpred_num_msa_rxn],
        ['MSA_EC', rxnpred_num_msa_ec],
        ['CatFam', rxnpred_num_catfam],
        ['deepEC', rxnpred_num_deepec],
    ],
    columns=['Method', 'predicted_rxn'],
)
rxnpred

Unnamed: 0,Method,predicted_rxn
0,RXNRECer,5266
1,ECRECer,4536
2,CLEAN,12832
3,MSA_RXN,2750
4,MSA_EC,2638
5,CatFam,1100
6,deepEC,984


In [152]:
# Size of the two protein sets as currently held in memory.
num_proteins_in_ncbi, num_proteins_in_trembl = len(data_ncbi), len(data_trembl)

print(f"Number of proteins in NCBI: {num_proteins_in_ncbi}")
print(f"Number of proteins in TrEMBL: {num_proteins_in_trembl}")

Number of proteins in NCBI: 12844
Number of proteins in TrEMBL: 14016


In [27]:
# TrEMBL entries whose sequence also occurs in the NCBI set and that carry a Rhea reaction id.
shared_with_ncbi = data_trembl['seq'].isin(data_ncbi['seq'])
has_rhea_id = data_trembl['rxn_id'].notna()
data_trembl[shared_with_ncbi & has_rhea_id]

Unnamed: 0,uniprot_id,Reviewed,Entry Name,Protein names,Gene Names,Organism,len,ec,rxn_id,GeneID,Gene Names (ordered locus),PubMed ID,seq
0,A0A2L2SSP6,unreviewed,A0A2L2SSP6_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,2423,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAQAQQIALERLEQVSRGLKSKVSDDVRKRSAVQLRELVVICHRDL...
1,A0A2L2ST09,unreviewed,A0A2L2ST09_9HYPO,Serine/threonine-protein kinase RIO1 (EC 2.7.1...,,Fusarium venenatum,546,2.7.11.1,RHEA:17989 RHEA:46608,,,,MDPAAPHQPPYTYTANQGYEQTEEIPRELQTQRDDGAALDNQDDDN...
2,A0A2L2T2H5,unreviewed,A0A2L2T2H5_9HYPO,Histone acetyltransferase type B catalytic sub...,,Fusarium venenatum,478,2.3.1.48,RHEA:45948,,,,MEDVTPWLSDANEAIQINLLSPSDSGLQHIATFNPRHTYSIFGDEE...
3,A0A2L2TC43,unreviewed,A0A2L2TC43_9HYPO,Pentafunctional AROM polypeptide [Includes: 3-...,,Fusarium venenatum,1568,1.1.1.25; 2.5.1.19; 2.7.1.71; 4.2.1.10; 4.2.3.4,RHEA:21096 RHEA:21256 RHEA:21968 RHEA:13121 RH...,,,,MAQAGGQDPTRISILGEPNIIVDHGLWLNFVIDDLLQNIPTSTYVL...
5,A0A2L2TK64,unreviewed,A0A2L2TK64_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,950,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAGPQESSTSSGSRKSGSRAVGQFNIGSEIGKGSFAQVYLGWHKET...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9913,A0A2L2TGT7,unreviewed,A0A2L2TGT7_9HYPO,aldehyde dehydrogenase (NAD(+)) (EC 1.2.1.3),,Fusarium venenatum,492,1.2.1.3,RHEA:16185,,,,MAEITITGAGSRKIQIPTGLFINNEFIPSTTSETLTTENPTNNTPL...
10321,A0A2L2TIE7,unreviewed,A0A2L2TIE7_9HYPO,ADP-ribose 1''-phosphate phosphatase (EC 3.1.3...,,Fusarium venenatum,220,3.1.3.84,RHEA:25029,,,,MTIRSVNDIPSLTQLYRDPDSVLSAASPNDKTSFPPVDRINTRIGL...
10605,A0A2L2TJJ1,unreviewed,A0A2L2TJJ1_9HYPO,Amine oxidase (EC 1.4.3.-),,Fusarium venenatum,486,1.4.3.-,RHEA:26414,,,,MSSKDGYSWTESQGLKSGVPCIGAISPPTNLKDNNTKYDVIVVGAG...
11116,A0A2L2TLM3,unreviewed,A0A2L2TLM3_9HYPO,"L-2-hydroxyglutarate dehydrogenase, mitochondr...",,Fusarium venenatum,427,1.1.99.2,RHEA:21252,,,,MPLQIHNSTSTIPTMLRSAARKISSNFLPSQRHNFSSTTAINADFT...


In [19]:

# List of column names
columns = ['RXNRECer2EC', 'clean', 'ECRECer', 'MSA_EC', 'deepec', 'catfam']

# Initialize a dictionary to store the intersection counts
intersection_counts = {}

# Iterate over all possible non-empty subsets (from pairs to all 6 columns)
for k in range(2, 7):  # Check intersections of 2, 3, ..., 6 columns
    for subset in itertools.combinations(columns, k):
        # Create a mask where all columns in the subset are equal
        mask = True
        for col in subset[1:]:
            mask &= (res[subset[0]] == res[col])
        # Count the number of rows where all columns in the subset are equal
        count = len(res[mask])
        intersection_counts[subset] = count

# Print the results
for subset, count in intersection_counts.items():
    print(f"{subset}: {count}")

('RXNRECer2EC', 'clean'): 256
('RXNRECer2EC', 'ECRECer'): 7454
('RXNRECer2EC', 'MSA_EC'): 3317
('RXNRECer2EC', 'deepec'): 175
('RXNRECer2EC', 'catfam'): 7758
('clean', 'ECRECer'): 1529
('clean', 'MSA_EC'): 1249
('clean', 'deepec'): 678
('clean', 'catfam'): 694
('ECRECer', 'MSA_EC'): 4492
('ECRECer', 'deepec'): 666
('ECRECer', 'catfam'): 8935
('MSA_EC', 'deepec'): 6847
('MSA_EC', 'catfam'): 4297
('deepec', 'catfam'): 434
('RXNRECer2EC', 'clean', 'ECRECer'): 256
('RXNRECer2EC', 'clean', 'MSA_EC'): 254
('RXNRECer2EC', 'clean', 'deepec'): 171
('RXNRECer2EC', 'clean', 'catfam'): 255
('RXNRECer2EC', 'ECRECer', 'MSA_EC'): 3168
('RXNRECer2EC', 'ECRECer', 'deepec'): 174
('RXNRECer2EC', 'ECRECer', 'catfam'): 7426
('RXNRECer2EC', 'MSA_EC', 'deepec'): 175
('RXNRECer2EC', 'MSA_EC', 'catfam'): 3275
('RXNRECer2EC', 'deepec', 'catfam'): 174
('clean', 'ECRECer', 'MSA_EC'): 929
('clean', 'ECRECer', 'deepec'): 583
('clean', 'ECRECer', 'catfam'): 618
('clean', 'MSA_EC', 'deepec'): 477
('clean', 'MSA_EC', 

### α-酮戊二酸到谷氨酸的例子

In [28]:
# Predictions of all tools for the example protein.
res.loc[res['input_id'] == 'XP_025581669.2']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam
617,XP_025581669.2,RHEA:11612,1.4.1.3;1.4.1.4,1.4.1.4,1.4.1.4,RHEA:11612,1.4.1.4,1.4.1.4,1.4.1.4


In [87]:
# Rows whose CLEAN value differs from the '-' placeholder.
res.loc[res['clean'].ne('-')]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;-;RHEA:36295,REACTION-WITHOUT-EC;REACTION-WITHOUT-EC;REACTI...,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
2,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-
3,XP_025580789.2,-,-,3.1.1.42,-,-,-,NO-PREDICTION,-
4,XP_025580792.2,RHEA:48940;-;RHEA:11312;RHEA:14025,3.4.13.19;-;3.8.1.8;3.5.4.24,3.5.4.40,3.8.1.8,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
...,...,...,...,...,...,...,...,...,...
12839,XP_065466151.1,-,-,5.4.99.30,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
12840,XP_065466152.1,-,-,4.2.1.159,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
12841,XP_065466153.1,-,-,2.1.1.86,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
12842,XP_065466154.1,RHEA:12044;RHEA:17561;-,3.1.1.3;3.1.1.34;3.1.1.79;3.1.1.7;-,3.1.1.42,3.1.1.3,RHEA:12044,3.1.1.3,3.1.1.1,-


In [154]:
# TrEMBL entries whose sequence also occurs in the NCBI protein set.
data_trembl.loc[data_trembl['seq'].isin(data_ncbi['seq'])]

Unnamed: 0,uniprot_id,Reviewed,Entry Name,Protein names,Gene Names,Organism,len,ec,rxn_id,GeneID,Gene Names (ordered locus),PubMed ID,seq
0,A0A2L2SSP6,unreviewed,A0A2L2SSP6_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,2423,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAQAQQIALERLEQVSRGLKSKVSDDVRKRSAVQLRELVVICHRDL...
1,A0A2L2ST09,unreviewed,A0A2L2ST09_9HYPO,Serine/threonine-protein kinase RIO1 (EC 2.7.1...,,Fusarium venenatum,546,2.7.11.1,RHEA:17989 RHEA:46608,,,,MDPAAPHQPPYTYTANQGYEQTEEIPRELQTQRDDGAALDNQDDDN...
2,A0A2L2T2H5,unreviewed,A0A2L2T2H5_9HYPO,Histone acetyltransferase type B catalytic sub...,,Fusarium venenatum,478,2.3.1.48,RHEA:45948,,,,MEDVTPWLSDANEAIQINLLSPSDSGLQHIATFNPRHTYSIFGDEE...
3,A0A2L2TC43,unreviewed,A0A2L2TC43_9HYPO,Pentafunctional AROM polypeptide [Includes: 3-...,,Fusarium venenatum,1568,1.1.1.25; 2.5.1.19; 2.7.1.71; 4.2.1.10; 4.2.3.4,RHEA:21096 RHEA:21256 RHEA:21968 RHEA:13121 RH...,,,,MAQAGGQDPTRISILGEPNIIVDHGLWLNFVIDDLLQNIPTSTYVL...
5,A0A2L2TK64,unreviewed,A0A2L2TK64_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,950,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAGPQESSTSSGSRKSGSRAVGQFNIGSEIGKGSFAQVYLGWHKET...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13974,A0A2L2U5N4,unreviewed,A0A2L2U5N4_9HYPO,N-acetyltransferase domain-containing protein,,Fusarium venenatum,231,,,,,,MSQPSLPYNKYRIRPATYSDVPAVTRLYAWSFGNEPLIEFFFPTRK...
14004,D2JM00,unreviewed,D2JM00_9HYPO,MSF superfamily transporter,FVENE_0001,Fusarium venenatum,545,,,,,19843228,MGKETLDASVSHAESYDVEARVPTKGVHDTTDQARLGAQAEHNLPP...
14005,D2JM01,unreviewed,D2JM01_9HYPO,Acyltransferase,TRI16,Fusarium venenatum,492,,,,,19843228,MPCTQYQRTKMALLSPLDQLNSSFYLRWSLVLQVKDLNKAVGSLSK...
14006,D2JM03,unreviewed,D2JM03_9HYPO,Ubiquitin 3 binding protein But2 C-terminal do...,FVENE_0002,Fusarium venenatum,259,,,,,19843228,MRVLLFATGLSAAIVVVQAGVCKPAYTTTSHSDAVSTIESSATSAS...


In [30]:
# Display the merged prediction table of all tools.
res

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;-;RHEA:36295,REACTION-WITHOUT-EC;REACTION-WITHOUT-EC;REACTI...,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
2,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-
3,XP_025580789.2,-,-,3.1.1.42,-,-,-,NO-PREDICTION,-
4,XP_025580792.2,RHEA:48940;-;RHEA:11312;RHEA:14025,3.4.13.19;-;3.8.1.8;3.5.4.24,3.5.4.40,3.8.1.8,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
...,...,...,...,...,...,...,...,...,...
12839,XP_065466151.1,-,-,5.4.99.30,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
12840,XP_065466152.1,-,-,4.2.1.159,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
12841,XP_065466153.1,-,-,2.1.1.86,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-
12842,XP_065466154.1,RHEA:12044;RHEA:17561;-,3.1.1.3;3.1.1.34;3.1.1.79;3.1.1.7;-,3.1.1.42,3.1.1.3,RHEA:12044,3.1.1.3,3.1.1.1,-
