# 镰刀菌Case
> 2025-05-23   
> zhenkun.shi@tib.cas.cn

In [1]:

import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
from config import conf as cfg
import pandas as pd
import json
import plotly.graph_objects as go
import itertools
from tools import btools
from tqdm import tqdm
import rxnrecer as production
import subprocess
import tools.bioFunctionLib as bfl
from IPython.display import HTML
from pandarallel import pandarallel # 导入pandaralle
FIRST_TIME_RUN = False
pandarallel.initialize(progress_bar=False)
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1. 读取未注释的蛋白组数据
### 1.1 NCBI 蛋白数据

In [2]:
data_ncbi = bfl.fasta_to_dataframe(fasta_file='./data/ncbi_protein.fasta')
data_ncbi['seq'] = data_ncbi.seq.apply(lambda x: str(x).strip().upper())
print(f'load {len(data_ncbi)} proteins from ncbi')

# 直接统计重复序列及其对应的 uniprot_id
duplicated_seqs = data_ncbi[data_ncbi.duplicated('seq', keep=False)].groupby('seq').agg({
    'uniprot_id': list,
    'seq': 'count'
}).rename(columns={'seq': 'repeat_count'}).reset_index()

print(f'find {len(duplicated_seqs)} duplicated sequences')
# 输出结果
print(duplicated_seqs.uniprot_id)

# drop duplicate sequences
data_ncbi =data_ncbi.drop_duplicates(subset=['seq']).reset_index(drop=True)
print(f"After drop duplicates get {len(data_ncbi)} sequences")
data_ncbi['seq_len'] = data_ncbi['seq'].apply(len)
data_ncbi.head(3)


load 12844 proteins from ncbi
find 12 duplicated sequences
0     [XP_025586490.2, XP_065463036.1]
1     [XP_065463042.1, XP_065464976.1]
2     [XP_025581948.2, XP_065463031.1]
3     [XP_025581946.1, XP_065463033.1]
4     [XP_065463038.1, XP_065464974.1]
5     [XP_065463037.1, XP_065464973.1]
6     [XP_025586493.2, XP_065463040.1]
7     [XP_025586492.1, XP_065463039.1]
8     [XP_025586653.1, XP_025592365.1]
9     [XP_065463041.1, XP_065464975.1]
10    [XP_025581947.1, XP_065463032.1]
11    [XP_025581945.1, XP_065463034.1]
Name: uniprot_id, dtype: object
After drop duplicates get 12832 sequences


Unnamed: 0,uniprot_id,seq,seq_len
0,XP_025580784.2,MSLLHEIISNLNSDPSRLNDFIAYLSKNHCLETLQFIQDASRYRAC...,254
1,XP_025580785.1,MEMRKSTSSIPWQNDRPWKQSFYGWSPETSRWQGIQSSQSSFDKSD...,331
2,XP_025580788.2,MVPEQQRSPRILACVLCHQRKKKCDRKSPCSFCTKAGIECIPSTPA...,96


## 1.2 Trembl data preparation

In [3]:
data_trembl = pd.read_csv('./data/uniprotkb_taxonomy_id_56646_2025_03_20.tsv', sep='\t').rename(columns={'Entry':'uniprot_id','EC number':'ec','Rhea ID':'rxn_id','Length':'len','Sequence':'seq'})
data_trembl['seq'] = data_trembl.seq.apply(lambda x: str(x).strip().upper())
print(f'Load {len(data_trembl)} of proteins from trembl ')
data_trembl.head(3)

Load 14016 of proteins from trembl 


Unnamed: 0,uniprot_id,Reviewed,Entry Name,Protein names,Gene Names,Organism,len,ec,rxn_id,GeneID,Gene Names (ordered locus),PubMed ID,seq
0,A0A2L2SSP6,unreviewed,A0A2L2SSP6_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,2423,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAQAQQIALERLEQVSRGLKSKVSDDVRKRSAVQLRELVVICHRDL...
1,A0A2L2ST09,unreviewed,A0A2L2ST09_9HYPO,Serine/threonine-protein kinase RIO1 (EC 2.7.1...,,Fusarium venenatum,546,2.7.11.1,RHEA:17989 RHEA:46608,,,,MDPAAPHQPPYTYTANQGYEQTEEIPRELQTQRDDGAALDNQDDDN...
2,A0A2L2T2H5,unreviewed,A0A2L2T2H5_9HYPO,Histone acetyltransferase type B catalytic sub...,,Fusarium venenatum,478,2.3.1.48,RHEA:45948,,,,MEDVTPWLSDANEAIQINLLSPSDSGLQHIATFNPRHTYSIFGDEE...


## 2. 开始注释

In [4]:
file_input_fasta = f'{cfg.CASE_DIR}fusarium_venenatum/data/ncbi_protein.fasta'

file_output_rxnrecer = f'{cfg.CASE_DIR}fusarium_venenatum/res/fusarium_ensemble_0524_morning.pkl'
file_output_ecrecer = f'{cfg.CASE_DIR}fusarium_venenatum/res/ecrecer.tsv'
file_output_clean = f'{cfg.CASE_DIR}fusarium_venenatum/res/clean.tsv'
file_output_msa = f'{cfg.CASE_DIR}fusarium_venenatum/res/msa.tsv'
file_out_deepec = f'{cfg.CASE_DIR}fusarium_venenatum/res/deepec'
file_out_catfam = f'{cfg.CASE_DIR}fusarium_venenatum/res/catfam.tsv'
file_out_priam = f'{cfg.CASE_DIR}fusarium_venenatum/res/priam.tsv'


### 2.1 RXNRECer

In [None]:
# res_ensemble = production.step_by_step_prediction(input_data=data_ncbi[['uniprot_id', 'seq']], Ensemble=True)
# res_ensemble.to_pickle(file_output_rxnrecer)

### 2.2 CLEAN

In [7]:
# cmd = f'singularity exec --nv /hpcfs/fpublic/container/singularity/app/clean/clean.sif python /app/inference.py -i {file_input_fasta} -o {file_output_clean} -d ~/tmp/'
# subprocess.run(cmd, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

### 2.3 ECRECer

In [8]:
# cmd=f'singularity exec --nv /hpcfs/fpublic/container/singularity/app/ecrecer/ecrecer.sif python /ecrecer/production.py -i {file_input_fasta} -o {file_output_ecrecer} -mode h -topk 20'
# subprocess.run(cmd, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

### 2.4 MSA

In [9]:
# ds_train = pd.read_feather(cfg.FILE_DS_TRAIN)
# blast_res = bfl.getblast(train=ds_train[['uniprot_id', 'seq']], test=data_ncbi, k=1)         # 序列比对
# blast_res = blast_res.merge(ds_train, left_on='sseqid', right_on='uniprot_id', how='left')[['id', 'reaction_id','ec_number']].rename(columns={'id':'uniprot_id', 'reaction_id':'MSA_RXN', 'ec_number':'MSA_EC'})
# blast_res = data_ncbi[['uniprot_id']].merge(blast_res, on='uniprot_id', how='left').fillna('NO-PREDICTION').rename(columns={'uniprot_id':'input_id'})
# blast_res.to_csv(file_output_msa, sep='\t', index=False)
# blast_res.head(3)

### 2.5 DeepEC prediction

In [10]:
# cmd = f'singularity exec --nv  /hpcfs/fpublic/container/singularity/app/deepec/deepec.sif python /opt/deepec/deepec.py -i {file_input_fasta} -o {file_out_deepec}'
# subprocess.run(cmd, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

### 2.6 CatFam

In [11]:
# cmd = f'singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif /catfam/source/catsearch.pl -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R -i {file_input_fasta} -o {file_out_catfam}'
# subprocess.run(cmd, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

### 2.7 PRIAM

In [None]:
# cmd = f'singularity exec /hpcfs/fpublic/container/singularity/app/catfam/catfam.sif /catfam/source/catsearch.pl -d /catfam/CatFamDB/CatFam_v2.0/CatFam4D99R -i {file_input_fasta} -o {file_out_priam}'
# subprocess.run(cmd, shell=True, check=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

## 3. 读取结果

In [5]:
# RXNRECer prediction for fusarium_venenatum
res_rxnrecer = pd.read_pickle(file_output_rxnrecer)

dict_rhea2ec = btools.load_dict_rxn2ec()
res_rxnrecer['RXNRECer2EC'] = res_rxnrecer.RXNRECer.apply(lambda x: btools.transRXN2EC(rxns=x, dict_rxn2ec=dict_rhea2ec)) 
res_rxnrecer['RXNRECer2EC'] = res_rxnrecer.RXNRECer2EC.apply(lambda x: (cfg.SPLITER).join(set(x.split(cfg.SPLITER))))  # EC去重

# Clean prediction for fusarium venenatum
res_clean = pd.read_csv(file_output_clean, sep='\t')[['Entry','clean_pred_ec_maxsep']].rename(columns={'Entry': 'input_id', 'clean_pred_ec_maxsep':'clean'})
res_clean.clean = res_clean.clean.apply(lambda x: x.split('/')[0].replace('EC:',''))

# ECRECer prediction for fusarium venenatum
res_ecrecer = pd.read_csv(file_output_ecrecer, sep='\t')[['input_id', 'dmlf_ec']].rename(columns={'dmlf_ec': 'ECRECer'})

# MSA 
res_msa = pd.read_csv(file_output_msa, sep='\t')

# DeepeEC
res_deepEC = btools.load_deepec_resluts(filepath=file_out_deepec+'/DeepEC_Result.txt').rename(columns={'id':'input_id', 'ec_deepec':'deepec'})

# CatFam
res_catfam = btools.load_catfam_res(file_out_catfam).rename(columns={'id': 'input_id', 'ec_catfam':'catfam'})

# PRIAM
res_priam = pd.read_csv(file_out_priam, sep='\t', names=['input_id', 'PRIAM'])
res_priam = (
    res_priam
    .groupby('input_id')['PRIAM']
    .apply(lambda x: ';'.join(x.dropna().astype(str)) if x.notna().any() else 'NO-PREDICTION')
    .reset_index()
)

res = res_rxnrecer[['input_id', 'RXNRECer', 'RXNRECer2EC']].merge(res_clean, on='input_id', how='left'
                                                        ).merge(res_ecrecer, on='input_id', how='left'
                                                        ).merge(res_msa, on='input_id', how='left'
                                                        ).merge(res_deepEC, on='input_id', how='left').fillna('NO-PREDICTION'
                                                        ).merge(res_catfam, on='input_id', how='left'
                                                        ).merge(res_priam, on='input_id', how='left')
res = res[res.input_id.isin(data_ncbi.uniprot_id)]

res = res.merge(data_ncbi[['uniprot_id','seq_len']].rename(columns={'uniprot_id':'input_id'}), on='input_id' ,how='left')

res.head(3)

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,254
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;RHEA:36295,REACTION-WITHOUT-EC;3.1.3.90,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,331
2,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-,NO-PREDICTION,96


In [11]:
case_ids = ['XP_065464856.1','XP_025582350.1','XP_025588571.1','XP_025583253.1']
res[res.input_id.isin(case_ids)]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
1097,XP_025582350.1,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,NO-PREDICTION,522
1740,XP_025583253.1,RHEA:16237,5.2.1.8,2.7.3.4,5.2.1.8,RHEA:16237,5.2.1.8,NO-PREDICTION,5.2.1.8,5.2.1.8,442
5385,XP_025588571.1,RHEA:16237,5.2.1.8,3.4.19.12,5.2.1.8,RHEA:16237,5.2.1.8,5.2.1.8,5.2.1.8,5.2.1.8,432
11536,XP_065464856.1,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,NO-PREDICTION,478


## 4. 处理结果

In [37]:
res[(res.RXNRECer2EC==res.clean) & (res.RXNRECer2EC==res.ECRECer) & (res.RXNRECer2EC==res.MSA_EC) & (res.RXNRECer2EC==res.MSA_EC) & (res.RXNRECer2EC==res.deepec) & (res.RXNRECer2EC==res.catfam)& (res.RXNRECer2EC==res.PRIAM)]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
132,XP_025580967.2,RHEA:20301,6.1.1.19,6.1.1.19,6.1.1.19,RHEA:20301,6.1.1.19,6.1.1.19,6.1.1.19,6.1.1.19,635
160,XP_025581006.1,RHEA:18561,1.3.1.70,1.3.1.70,1.3.1.70,RHEA:18561,1.3.1.70,1.3.1.70,1.3.1.70,1.3.1.70,485
189,XP_025581042.1,RHEA:21524,2.3.3.13,2.3.3.13,2.3.3.13,RHEA:21524,2.3.3.13,2.3.3.13,2.3.3.13,2.3.3.13,616
242,XP_025581123.1,RHEA:18585,5.3.1.1,5.3.1.1,5.3.1.1,RHEA:18585,5.3.1.1,5.3.1.1,5.3.1.1,5.3.1.1,247
283,XP_025581190.1,RHEA:23352,2.6.1.19,2.6.1.19,2.6.1.19,RHEA:23352,2.6.1.19,2.6.1.19,2.6.1.19,2.6.1.19,459
...,...,...,...,...,...,...,...,...,...,...,...
12473,XP_065465797.1,RHEA:22452,3.6.1.10,3.6.1.10,3.6.1.10,RHEA:22452,3.6.1.10,3.6.1.10,3.6.1.10,3.6.1.10,701
12477,XP_065465801.1,RHEA:25249;RHEA:27654,2.2.1.6,2.2.1.6,2.2.1.6,RHEA:25249;RHEA:27654,2.2.1.6,2.2.1.6,2.2.1.6,2.2.1.6,669
12558,XP_065465882.1,RHEA:58196,3.5.1.98,3.5.1.98,3.5.1.98,RHEA:58196,3.5.1.98,3.5.1.98,3.5.1.98,3.5.1.98,649
12566,XP_065465890.1,RHEA:20557,3.5.1.5,3.5.1.5,3.5.1.5,RHEA:20557,3.5.1.5,3.5.1.5,3.5.1.5,3.5.1.5,837


In [39]:
res[~res.RXNRECer.str.contains('-')]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;RHEA:36295,REACTION-WITHOUT-EC;3.1.3.90,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,331
4,XP_025580792.2,RHEA:48940;RHEA:11312;RHEA:14025,3.8.1.8;3.4.13.19;3.5.4.24,3.5.4.40,3.8.1.8,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,453
5,XP_025580793.1,RHEA:33807;RHEA:46920;RHEA:33227,4.1.1.102,4.1.1.61,4.1.1.-,RHEA:33227;RHEA:46920;RHEA:33807,4.1.1.102,NO-PREDICTION,-,NO-PREDICTION,530
6,XP_025580794.1,RHEA:37743,2.5.1.129,2.5.1.129,2.5.1.129,RHEA:37743,2.5.1.129,2.5.1.129,-,NO-PREDICTION,257
7,XP_025580796.1,RHEA:22488;RHEA:24164,1.8.4.12;4.4.1.22,4.4.1.22,1.8.4.12,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,136
...,...,...,...,...,...,...,...,...,...,...,...
12814,XP_065466138.1,RHEA:72767;RHEA:64512;RHEA:13097;RHEA:42852;RH...,3.5.1.4;2.3.1.244;4.2.1.59;REACTION-WITHOUT-EC,2.3.1.85,4.2.1.59,RHEA:42852,2.3.1.244,2.1.1.77,-,NO-PREDICTION,2553
12816,XP_065466140.1,RHEA:14109;RHEA:14105,1.5.1.2,1.5.1.2,1.5.1.2,RHEA:14109;RHEA:14105,1.5.1.2,NO-PREDICTION,1.5.1.2,1.5.1.2,289
12817,XP_065466141.1,RHEA:72775;RHEA:62276;RHEA:18565,2.3.1.161;REACTION-WITHOUT-EC,2.4.1.308,2.3.1.161,-,-,6.2.1.3;6.1.1.5;3.6.4.12,-,NO-PREDICTION,12902
12823,XP_065466147.1,RHEA:19637;RHEA:23688;RHEA:22936,1.5.1.19;3.5.4.2;3.5.1.25,3.5.4.40,3.5.1.25,RHEA:23688,3.5.4.2,NO-PREDICTION,-,NO-PREDICTION,449


In [45]:
rxnpred_num_rxnrecer = len(res[res.RXNRECer!='-'])
rxnpred_num_clean = len(res[res.clean!='-'])
rxnpred_num_ecrecer = len(res[res.ECRECer!='-'])
rxnpred_num_msa_rxn = len(res[(res.MSA_RXN!='-')&(res.MSA_RXN!='NO-PREDICTION')])
rxnpred_num_msa_ec = len(res[(res.MSA_EC!='-')&(res.MSA_EC!='NO-PREDICTION')])
rxnpred_num_catfam = len(res[res.catfam!='-'])
rxnpred_num_deepec = len(res[(res.deepec!='-')&(res.deepec!='NO-PREDICTION')])
rxnpred_num_praim = len(res[(res.PRIAM!='-')&(res.PRIAM!='NO-PREDICTION')])


rxnpred=[['RXNRECer', rxnpred_num_rxnrecer], 
         ['ECRECer', rxnpred_num_ecrecer],
         ['CLEAN', rxnpred_num_clean],
         ['MSA_RXN', rxnpred_num_msa_rxn],
         ['MSA_EC', rxnpred_num_msa_ec],
         ['CatFam', rxnpred_num_catfam],
         ['deepEC', rxnpred_num_deepec],
         ['PRIAM', rxnpred_num_praim]
         ]

rxnpred = pd.DataFrame(rxnpred, columns=['Method', 'predicted_rxn'])
rxnpred

Unnamed: 0,Method,predicted_rxn
0,RXNRECer,3297
1,ECRECer,4536
2,CLEAN,12832
3,MSA_RXN,2750
4,MSA_EC,2638
5,CatFam,1100
6,deepEC,984
7,PRIAM,1100


In [46]:
num_proteins_in_ncbi = len(data_ncbi)
num_proteins_in_trembl = len(data_trembl)

data_trembl_ncbi_common =data_trembl[data_trembl.seq.isin(data_ncbi.seq)]
num_proteins_inter_ncbi_trembl = len(data_trembl_ncbi_common)
num_ec_in_trembl =len(data_trembl[data_trembl.ec!='-'])

data_trembl_ncbi_common_with_function = data_trembl_ncbi_common[~data_trembl_ncbi_common.ec.isnull()]


print(f"Number of proteins in NCBI: {num_proteins_in_ncbi}")
print(f"Number of proteins in TrEMBL: {num_proteins_in_trembl}")
print(f"Records have ECs in TrEMBL: {num_ec_in_trembl}")
print(f"Number of proteins in common - NCBI and TrEMBL: {num_proteins_inter_ncbi_trembl}")
print(f"Number of ECs in common - NCBI and TrEMBL with function: {len(data_trembl_ncbi_common_with_function)}")

Number of proteins in NCBI: 12832
Number of proteins in TrEMBL: 14016
Records have ECs in TrEMBL: 14016
Number of proteins in common - NCBI and TrEMBL: 7064
Number of ECs in common - NCBI and TrEMBL with function: 852


### a-酮戊二酸到谷氨酸的例子

In [47]:
res[res.input_id=='XP_025581669.2']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
617,XP_025581669.2,RHEA:11612,1.4.1.3;1.4.1.4,1.4.1.4,1.4.1.4,RHEA:11612,1.4.1.4,1.4.1.4,1.4.1.4,1.4.1.4,451


### 多种注释方法交集

In [49]:
# List of column names
columns = ['RXNRECer2EC', 'clean', 'ECRECer', 'MSA_EC', 'deepec', 'catfam', 'PRIAM']

# Initialize a dictionary to store the intersection counts
intersection_counts = {}

# Iterate over all possible non-empty subsets (from pairs to all 6 columns)
for k in range(2, 7):  # Check intersections of 2, 3, ..., 6 columns
    for subset in itertools.combinations(columns, k):
        # Create a mask where all columns in the subset are equal
        mask = True
        for col in subset[1:]:
            mask &= (res[subset[0]] == res[col])
        # Count the number of rows where all columns in the subset are equal
        count = len(res[mask])
        intersection_counts[subset] = count

# Print the results
set_res_interset =[]
for subset, count in intersection_counts.items():
    line = [('-').join(subset), count]
    set_res_interset = set_res_interset + [line]
    
methods_interset = pd.DataFrame(set_res_interset, columns=['method', 'ec_interset'])
methods_interset.sort_values(by='ec_interset', ascending=False)

Unnamed: 0,method,ec_interset
19,deepec-PRIAM,11701
4,RXNRECer2EC-catfam,9638
13,ECRECer-catfam,8935
1,RXNRECer2EC-ECRECer,8681
28,RXNRECer2EC-ECRECer-catfam,8366
...,...,...
112,RXNRECer2EC-clean-ECRECer-MSA_EC-deepec-catfam,207
113,RXNRECer2EC-clean-ECRECer-MSA_EC-deepec-PRIAM,207
94,RXNRECer2EC-clean-ECRECer-deepec-catfam,207
115,RXNRECer2EC-clean-ECRECer-deepec-catfam-PRIAM,207


In [51]:
# List of column names
columns = ['RXNRECer2EC', 'clean', 'ECRECer', 'MSA_EC', 'deepec', 'catfam', 'PRIAM']

# Initialize a dictionary to store the intersection counts
intersection_counts = {}

# Iterate over all possible non-empty subsets (from pairs to all 6 columns)
for k in range(2, 7):  # Check intersections of 2, 3, ..., 6 columns
    for subset in itertools.combinations(columns, k):
        # Create a mask where all columns in the subset are equal
        mask = True
        for col in subset[1:]:
            mask &= (res[subset[0]] == res[col])
        # Count the number of rows where all columns in the subset are equal
        count = len(res[mask])
        intersection_counts[subset] = count

# Print the results
set_res_interset =[]
for subset, count in intersection_counts.items():
    line = [('-').join(subset), count]
    set_res_interset = set_res_interset + [line]
    
methods_interset = pd.DataFrame(set_res_interset, columns=['method', 'ec_interset'])
methods_interset.sort_values(by='ec_interset', ascending=False)

Unnamed: 0,method,ec_interset
19,deepec-PRIAM,11701
4,RXNRECer2EC-catfam,9638
13,ECRECer-catfam,8935
1,RXNRECer2EC-ECRECer,8681
28,RXNRECer2EC-ECRECer-catfam,8366
...,...,...
112,RXNRECer2EC-clean-ECRECer-MSA_EC-deepec-catfam,207
113,RXNRECer2EC-clean-ECRECer-MSA_EC-deepec-PRIAM,207
94,RXNRECer2EC-clean-ECRECer-deepec-catfam,207
115,RXNRECer2EC-clean-ECRECer-deepec-catfam-PRIAM,207


In [52]:
res[res.RXNRECer!='-']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;RHEA:36295,REACTION-WITHOUT-EC;3.1.3.90,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,331
4,XP_025580792.2,RHEA:48940;RHEA:11312;RHEA:14025,3.8.1.8;3.4.13.19;3.5.4.24,3.5.4.40,3.8.1.8,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,453
5,XP_025580793.1,RHEA:33807;RHEA:46920;RHEA:33227,4.1.1.102,4.1.1.61,4.1.1.-,RHEA:33227;RHEA:46920;RHEA:33807,4.1.1.102,NO-PREDICTION,-,NO-PREDICTION,530
6,XP_025580794.1,RHEA:37743,2.5.1.129,2.5.1.129,2.5.1.129,RHEA:37743,2.5.1.129,2.5.1.129,-,NO-PREDICTION,257
7,XP_025580796.1,RHEA:22488;RHEA:24164,1.8.4.12;4.4.1.22,4.4.1.22,1.8.4.12,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,136
...,...,...,...,...,...,...,...,...,...,...,...
12814,XP_065466138.1,RHEA:72767;RHEA:64512;RHEA:13097;RHEA:42852;RH...,3.5.1.4;2.3.1.244;4.2.1.59;REACTION-WITHOUT-EC,2.3.1.85,4.2.1.59,RHEA:42852,2.3.1.244,2.1.1.77,-,NO-PREDICTION,2553
12816,XP_065466140.1,RHEA:14109;RHEA:14105,1.5.1.2,1.5.1.2,1.5.1.2,RHEA:14109;RHEA:14105,1.5.1.2,NO-PREDICTION,1.5.1.2,1.5.1.2,289
12817,XP_065466141.1,RHEA:72775;RHEA:62276;RHEA:18565,2.3.1.161;REACTION-WITHOUT-EC,2.4.1.308,2.3.1.161,-,-,6.2.1.3;6.1.1.5;3.6.4.12,-,NO-PREDICTION,12902
12823,XP_065466147.1,RHEA:19637;RHEA:23688;RHEA:22936,1.5.1.19;3.5.4.2;3.5.1.25,3.5.4.40,3.5.1.25,RHEA:23688,3.5.4.2,NO-PREDICTION,-,NO-PREDICTION,449


In [53]:
res['rxnrecer_clean']=res.apply(lambda x: set(x.RXNRECer2EC.split(cfg.SPLITER)) == set(x.clean.split(cfg.SPLITER)), axis=1)
res['rxnrecer_ecrecer']=res.apply(lambda x: set(x.RXNRECer2EC.split(cfg.SPLITER)) == set(x.ECRECer.split(cfg.SPLITER)), axis=1)
res['rxnrecer_msa']=res.apply(lambda x: set(x.RXNRECer2EC.split(cfg.SPLITER)) == set(x.MSA_EC.split(cfg.SPLITER)), axis=1)
res['rxnrecer_deepec']=res.apply(lambda x: set(x.RXNRECer2EC.split(cfg.SPLITER)) == set(x.deepec.split(cfg.SPLITER)), axis=1)
res['rxnrecer_catfam']=res.apply(lambda x: set(x.RXNRECer2EC.split(cfg.SPLITER)) == set(x.catfam.split(cfg.SPLITER)), axis=1)

#### 处理3位EC号

In [59]:
def get_3digit_ec(ec_str):
    if ec_str == '-':
        return '-'
    else:
        ec_array = ec_str.split(cfg.SPLITER)
        
        for idx in range(len(ec_array)):
            if '.' in ec_array[idx]:
                ec_array[idx] = '.'.join(ec_array[idx].split('.')[:-1])
            else:
                ec_array[idx] = ec_array[idx]
        
        res = cfg.SPLITER.join(set(ec_array))
        return res

res['RXNRECer2EC_3digit']=res.RXNRECer2EC.apply(lambda x: get_3digit_ec(x))
res['clean_3digit']=res.clean.apply(lambda x: get_3digit_ec(x))
res['ECRECer_3digit']=res.ECRECer.apply(lambda x: get_3digit_ec(x))
res['MSA_EC_3digit']=res.MSA_EC.apply(lambda x: get_3digit_ec(x))
res['deepec_3digit']=res.deepec.apply(lambda x: get_3digit_ec(x))
res['catfam_3digit']=res.catfam.apply(lambda x: get_3digit_ec(x))         
res['priam_3digit']=res.PRIAM.apply(lambda x: get_3digit_ec(x))         

In [60]:
# List of column names
columns = ['RXNRECer2EC_3digit', 'clean_3digit', 'ECRECer_3digit', 'MSA_EC_3digit', 'deepec_3digit', 'catfam_3digit', 'priam_3digit']

# Initialize a dictionary to store the intersection counts
intersection_counts = {}

# Iterate over all possible non-empty subsets (from pairs to all 6 columns)
for k in range(2, 7):  # Check intersections of 2, 3, ..., 6 columns
    for subset in itertools.combinations(columns, k):
        # Create a mask where all columns in the subset are equal
        mask = True
        for col in subset[1:]:
            mask &= (res[subset[0]] == res[col])
        # Count the number of rows where all columns in the subset are equal
        count = len(res[mask])
        intersection_counts[subset] = count

# Print the results
set_res_interset_3digit =[]
for subset, count in intersection_counts.items():
    line = [('-').join(subset), count]
    set_res_interset_3digit = set_res_interset_3digit + [line]

methods_interset_3digit = pd.DataFrame(set_res_interset_3digit, columns=['method', 'ec_interset'])
methods_interset_3digit.sort_values(by='ec_interset', ascending=False)

Unnamed: 0,method,ec_interset
19,deepec_3digit-priam_3digit,11738
4,RXNRECer2EC_3digit-catfam_3digit,9787
1,RXNRECer2EC_3digit-ECRECer_3digit,9227
13,ECRECer_3digit-catfam_3digit,9024
28,RXNRECer2EC_3digit-ECRECer_3digit-catfam_3digit,8503
...,...,...
97,RXNRECer2EC_3digit-clean_3digit-MSA_EC_3digit-...,285
116,RXNRECer2EC_3digit-clean_3digit-MSA_EC_3digit-...,285
117,RXNRECer2EC_3digit-ECRECer_3digit-MSA_EC_3digi...,285
112,RXNRECer2EC_3digit-clean_3digit-ECRECer_3digit...,283


In [62]:
# 3-digit 版本比较
res['rxnrecer_clean_3digit'] = res.apply(
    lambda x: set(x.RXNRECer2EC_3digit.split(cfg.SPLITER)) == set(x.clean_3digit.split(cfg.SPLITER)), 
    axis=1
)
res['rxnrecer_ecrecer_3digit'] = res.apply(
    lambda x: set(x.RXNRECer2EC_3digit.split(cfg.SPLITER)) == set(x.ECRECer_3digit.split(cfg.SPLITER)), 
    axis=1
)
res['rxnrecer_msa_3digit'] = res.apply(
    lambda x: set(x.RXNRECer2EC_3digit.split(cfg.SPLITER)) == set(x.MSA_EC_3digit.split(cfg.SPLITER)), 
    axis=1
)
res['rxnrecer_deepec_3digit'] = res.apply(
    lambda x: set(x.RXNRECer2EC_3digit.split(cfg.SPLITER)) == set(x.deepec_3digit.split(cfg.SPLITER)), 
    axis=1
)
res['rxnrecer_catfam_3digit'] = res.apply(
    lambda x: set(x.RXNRECer2EC_3digit.split(cfg.SPLITER)) == set(x.catfam_3digit.split(cfg.SPLITER)), 
    axis=1
)
res['rxnrecer_priam_3digit'] = res.apply(
    lambda x: set(x.RXNRECer2EC_3digit.split(cfg.SPLITER)) == set(x.priam_3digit.split(cfg.SPLITER)), 
    axis=1
)

In [64]:
res.to_feather(f'{cfg.CASE_DIR}fusarium_venenatum/res/methds_3digit_0526.feather')

In [26]:
res.to_excel('res/res_case_fusarium_venenatum.xlsx')

# 找CLEAN注释错误的例子

In [73]:
case_clean_wrong = res[(res.rxnrecer_clean_3digit==False) & (res.RXNRECer=='-')].copy().reset_index(drop=True)
case_clean_wrong

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,254,...,2.7.7,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
1,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-,96,...,2.3.1,-,-,NO-PREDICTION,-,False,True,True,False,True
2,XP_025580789.2,-,-,3.1.1.42,-,-,-,NO-PREDICTION,-,460,...,3.1.1,-,-,NO-PREDICTION,-,False,True,True,False,True
3,XP_025580806.2,-,-,2.7.13.3,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,475,...,2.7.13,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
4,XP_025580809.2,-,-,3.2.2.22,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,177,...,3.2.2,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7561,XP_065466149.1,-,-,2.3.2.27,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,136,...,2.3.2,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
7562,XP_065466151.1,-,-,5.4.99.30,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,74,...,5.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
7563,XP_065466152.1,-,-,4.2.1.159,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,199,...,4.2.1,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
7564,XP_065466153.1,-,-,2.1.1.86,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,60,...,2.1.1,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True


In [84]:
case_clean_wrong.to_feather('middle/case_clean_wrong.feather')

In [74]:
case_clean_wrong[case_clean_wrong.groupby('clean_3digit')['clean_3digit'].transform('count') < 20]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
5,XP_025580812.1,-,-,3.8.1.9,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,271,...,3.8.1,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
15,XP_025580832.1,-,-,3.1.2.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,328,...,3.1.2,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
20,XP_025580849.2,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,499,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
50,XP_025580916.1,-,-,7.2.4.3,2.7.1.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,614,...,7.2.4,2.7.1,NO-PREDICTION,NO-PREDICTION,-,False,False,False,False,True
65,XP_025580953.1,-,-,3.4.11.18,3.4.11.18,-,-,3.4.11.18,3.4.11.18,461,...,3.4.11,3.4.11,-,3.4.11,3.4.11,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7517,XP_065466069.1,-,-,2.8.2.5,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,258,...,2.8.2,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
7525,XP_065466089.1,-,-,1.12.2.1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,77,...,1.12.2,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
7530,XP_065466094.1,-,-,1.8.3.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,265,...,1.8.3,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
7534,XP_065466100.1,-,-,1.23.5.1,-,-,-,NO-PREDICTION,-,146,...,1.23.5,-,-,NO-PREDICTION,-,False,True,True,False,True


In [79]:
case_clean_wrong[case_clean_wrong.clean_3digit=='3.4.11']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
65,XP_025580953.1,-,-,3.4.11.18,3.4.11.18,-,-,3.4.11.18,3.4.11.18,461,...,3.4.11,3.4.11,-,3.4.11,3.4.11,False,False,True,False,False
340,XP_025581578.2,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,786,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
700,XP_025582496.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,1096,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
916,XP_025582990.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,875,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
1178,XP_025583689.1,-,-,3.4.11.15,3.4.11.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,378,...,3.4.11,3.4.11,NO-PREDICTION,NO-PREDICTION,-,False,False,False,False,True
2352,XP_025586816.2,-,-,3.4.11.18,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,297,...,3.4.11,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
3437,XP_025589480.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,632,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
4058,XP_025590973.1,-,-,3.4.11.18,3.4.11.18,-,-,3.4.11.18,3.4.11.18,441,...,3.4.11,3.4.11,-,3.4.11,3.4.11,False,False,True,False,False
4564,XP_025592412.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,636,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
5076,XP_025593565.1,-,-,3.4.11.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,787,...,3.4.11,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True


In [81]:
case_clean_wrong[case_clean_wrong.clean=='3.4.11.2']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
340,XP_025581578.2,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,786,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
700,XP_025582496.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,1096,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
916,XP_025582990.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,875,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
3437,XP_025589480.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,632,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
4564,XP_025592412.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,636,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
5076,XP_025593565.1,-,-,3.4.11.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,787,...,3.4.11,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
6906,XP_065465146.1,-,-,3.4.11.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,195,...,3.4.11,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True


In [83]:
res[res.clean=='3.4.11.2']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
555,XP_025581578.2,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,786,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
1198,XP_025582496.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,1096,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
1552,XP_025582990.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,875,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
2349,XP_025584208.2,-;RHEA:19037;RHEA:28286,2.1.1.201;3.3.2.10;-,3.4.11.2,2.1.1.201,RHEA:19037,3.3.2.6,NO-PREDICTION,-,1285,...,3.4.11,2.1.1,3.3.2,NO-PREDICTION,-,False,False,False,False,False
5817,XP_025589145.2,RHEA:19037,3.3.2.10,3.4.11.2,3.4.11.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,880,...,3.4.11,3.4.11,NO-PREDICTION,NO-PREDICTION,-,False,False,False,False,False
6071,XP_025589480.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,632,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
8069,XP_025592412.1,-,-,3.4.11.2,-,-,-,NO-PREDICTION,-,636,...,3.4.11,-,-,NO-PREDICTION,-,False,True,True,False,True
8914,XP_025593565.1,-,-,3.4.11.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,787,...,3.4.11,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
11822,XP_065465146.1,-,-,3.4.11.2,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,195,...,3.4.11,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True


In [68]:
res[res.clean_3digit=='2.4.99']

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,seq_len,...,clean_3digit,ECRECer_3digit,MSA_EC_3digit,deepec_3digit,catfam_3digit,rxnrecer_clean_3digit,rxnrecer_ecrecer_3digit,rxnrecer_msa_3digit,rxnrecer_deepec_3digit,rxnrecer_catfam_3digit
39,XP_025580849.2,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,499,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
1097,XP_025582350.1,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,522,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
3660,XP_025586043.1,-,-,2.4.99.20,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,904,...,2.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
4566,XP_025587400.1,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,493,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
5130,XP_025588202.2,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,494,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
6164,XP_025589614.1,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,484,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
7275,XP_025591181.1,-,-,2.4.99.20,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,218,...,2.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
11536,XP_065464856.1,-,-,2.4.99.19,-,-,-,NO-PREDICTION,-,478,...,2.4.99,-,-,NO-PREDICTION,-,False,True,True,False,True
11902,XP_065465226.1,-,-,2.4.99.20,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,458,...,2.4.99,-,NO-PREDICTION,NO-PREDICTION,-,False,True,False,False,True
12629,XP_065465953.1,RHEA:22980,2.4.99.18,2.4.99.18,2.4.99.18,RHEA:22980,2.4.99.18,NO-PREDICTION,-,749,...,2.4.99,2.4.99,2.4.99,NO-PREDICTION,-,True,True,True,False,False


In [1]:
res[(~res.rxnrecer_clean) &(res.RXNRECer!='-')]

NameError: name 'res' is not defined

In [154]:
data_trembl[data_trembl.seq.isin(data_ncbi.seq)]

Unnamed: 0,uniprot_id,Reviewed,Entry Name,Protein names,Gene Names,Organism,len,ec,rxn_id,GeneID,Gene Names (ordered locus),PubMed ID,seq
0,A0A2L2SSP6,unreviewed,A0A2L2SSP6_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,2423,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAQAQQIALERLEQVSRGLKSKVSDDVRKRSAVQLRELVVICHRDL...
1,A0A2L2ST09,unreviewed,A0A2L2ST09_9HYPO,Serine/threonine-protein kinase RIO1 (EC 2.7.1...,,Fusarium venenatum,546,2.7.11.1,RHEA:17989 RHEA:46608,,,,MDPAAPHQPPYTYTANQGYEQTEEIPRELQTQRDDGAALDNQDDDN...
2,A0A2L2T2H5,unreviewed,A0A2L2T2H5_9HYPO,Histone acetyltransferase type B catalytic sub...,,Fusarium venenatum,478,2.3.1.48,RHEA:45948,,,,MEDVTPWLSDANEAIQINLLSPSDSGLQHIATFNPRHTYSIFGDEE...
3,A0A2L2TC43,unreviewed,A0A2L2TC43_9HYPO,Pentafunctional AROM polypeptide [Includes: 3-...,,Fusarium venenatum,1568,1.1.1.25; 2.5.1.19; 2.7.1.71; 4.2.1.10; 4.2.3.4,RHEA:21096 RHEA:21256 RHEA:21968 RHEA:13121 RH...,,,,MAQAGGQDPTRISILGEPNIIVDHGLWLNFVIDDLLQNIPTSTYVL...
5,A0A2L2TK64,unreviewed,A0A2L2TK64_9HYPO,non-specific serine/threonine protein kinase (...,,Fusarium venenatum,950,2.7.11.1,RHEA:17989 RHEA:46608,,,,MAGPQESSTSSGSRKSGSRAVGQFNIGSEIGKGSFAQVYLGWHKET...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13974,A0A2L2U5N4,unreviewed,A0A2L2U5N4_9HYPO,N-acetyltransferase domain-containing protein,,Fusarium venenatum,231,,,,,,MSQPSLPYNKYRIRPATYSDVPAVTRLYAWSFGNEPLIEFFFPTRK...
14004,D2JM00,unreviewed,D2JM00_9HYPO,MSF superfamily transporter,FVENE_0001,Fusarium venenatum,545,,,,,19843228,MGKETLDASVSHAESYDVEARVPTKGVHDTTDQARLGAQAEHNLPP...
14005,D2JM01,unreviewed,D2JM01_9HYPO,Acyltransferase,TRI16,Fusarium venenatum,492,,,,,19843228,MPCTQYQRTKMALLSPLDQLNSSFYLRWSLVLQVKDLNKAVGSLSK...
14006,D2JM03,unreviewed,D2JM03_9HYPO,Ubiquitin 3 binding protein But2 C-terminal do...,FVENE_0002,Fusarium venenatum,259,,,,,19843228,MRVLLFATGLSAAIVVVQAGVCKPAYTTTSHSDAVSTIESSATSAS...


In [21]:
res[(~res.MSA_EC.str.contains('NO-PREDICTION'))&(res.MSA_EC!='-')&(res.RXNRECer=='-')]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
29,XP_025580837.2,-,-,6.2.1.8,6.3.2.-,RHEA:32139,6.2.1.-,NO-PREDICTION,-,NO-PREDICTION,825
63,XP_025580885.2,-,-,2.3.1.48,-,RHEA:14357,2.7.1.26,NO-PREDICTION,2.7.1.26,2.7.1.26,698
97,XP_025580928.1,-,-,3.1.2.2,3.1.2.-,RHEA:16781;RHEA:40059;RHEA:30143;RHEA:40111;RH...,3.1.2.-,NO-PREDICTION,-,NO-PREDICTION,170
165,XP_025581011.1,-,-,3.1.3.2,3.1.13.4,RHEA:14445,3.1.4.4,3.2.1.91,-,NO-PREDICTION,626
252,XP_025581138.1,-,-,5.6.2.1,5.99.1.2,RHEA:13065,5.99.1.3;3.6.4.12,5.99.1.2,5.99.1.2,5.99.1.2,659
...,...,...,...,...,...,...,...,...,...,...,...
12562,XP_065465886.1,-,-,1.14.13.25,3.6.4.-,RHEA:20301,6.1.1.19,NO-PREDICTION,6.1.1.19,6.1.1.19,1296
12663,XP_065465987.1,-,-,3.1.4.53,2.7.11.25,RHEA:13609;RHEA:42348,2.7.1.150,NO-PREDICTION,-,NO-PREDICTION,2924
12690,XP_065466014.1,-,-,3.8.1.3,3.3.2.9,RHEA:23900;RHEA:55764;RHEA:44048;RHEA:44044;RH...,3.3.2.9,NO-PREDICTION,-,NO-PREDICTION,125
12715,XP_065466039.1,-,-,2.5.1.18,-,RHEA:12821,4.1.2.4,NO-PREDICTION,4.1.2.4,4.1.2.4,811


In [None]:
res[res.]

Unnamed: 0,input_id,RXNRECer,RXNRECer2EC,clean,ECRECer,MSA_RXN,MSA_EC,deepec,catfam,PRIAM,seq_len
0,XP_025580784.2,-,-,2.7.7.n1,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,254
1,XP_025580785.1,RHEA:55688;RHEA:55696;RHEA:55700;RHEA:36295,REACTION-WITHOUT-EC;3.1.3.90,3.1.4.12,3.1.-.-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,331
2,XP_025580788.2,-,-,2.3.1.48,-,-,-,NO-PREDICTION,-,NO-PREDICTION,96
3,XP_025580789.2,-,-,3.1.1.42,-,-,-,NO-PREDICTION,-,NO-PREDICTION,460
4,XP_025580792.2,RHEA:48940;RHEA:11312;RHEA:14025,3.4.13.19;3.5.4.24;3.8.1.8,3.5.4.40,3.8.1.8,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,453
...,...,...,...,...,...,...,...,...,...,...,...
12827,XP_065466151.1,-,-,5.4.99.30,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,74
12828,XP_065466152.1,-,-,4.2.1.159,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,199
12829,XP_065466153.1,-,-,2.1.1.86,-,NO-PREDICTION,NO-PREDICTION,NO-PREDICTION,-,NO-PREDICTION,60
12830,XP_065466154.1,RHEA:12044;RHEA:17561,3.1.1.34;3.1.1.79;3.1.1.3;3.1.1.7,3.1.1.42,3.1.1.3,RHEA:12044,3.1.1.3,3.1.1.1,-,NO-PREDICTION,571
