# Datasets analysis 

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2025-01-04  



## 1. Import packages

In [8]:
import pandas as pd
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tkinter import _flatten
from tools import rheatool as rheatool
import plotly.express as px
from collections import OrderedDict
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)

from IPython.display import  HTML
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load data

In [9]:
# Full datasets: ds_rcv is read from cfg.FILE_DS_TRAIN, ds_rcp from cfg.FILE_DS_TEST,
# and rhea from cfg.FILE_DS_RHEA_REACTIONS.
ds_rcv, ds_rcp, rhea = (
    pd.read_feather(path)
    for path in (cfg.FILE_DS_TRAIN, cfg.FILE_DS_TEST, cfg.FILE_DS_RHEA_REACTIONS)
)

# 10-fold cross-validation dataset (fold directories are numbered 1..10)
fold_numbers = range(1, 11)
train_paths = [f'{cfg.DIR_DATASET}validation/fold{i}/train.feather' for i in fold_numbers]
test_paths = [f'{cfg.DIR_DATASET}validation/fold{i}/valid.feather' for i in fold_numbers]

dfs_train = list(map(pd.read_feather, train_paths))
dfs_test = list(map(pd.read_feather, test_paths))

print('loading success')

loading success


## 3. Statistics 

In [11]:

def stat_ds(dsname, name='Dataset'):
    """Summarise protein, reaction and EC-number counts of one dataset.

    Args:
        dsname: DataFrame with the columns ``uniprot_id``, ``reaction_id``
            (``';'``-separated IDs, ``'-'`` when there is none),
            ``ec_number`` (``','``-separated EC numbers, ``'-'`` when there
            is none) and ``ec_specific_level``.
        name: Label written to the ``Dataset`` column of the result.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per statistic.
    """
    # 1. Basic statistics
    num_proteins = len(set(dsname.uniprot_id))
    none_enzyme_records = len(dsname[dsname.reaction_id == "-"])
    enzyme_records = len(dsname[dsname.reaction_id != "-"])

    # 2. Reactions
    # Flatten with plain comprehensions: this needs no GUI-toolkit helper and
    # stays linear in the number of records.
    reactions = {
        rxn
        for entry in set(dsname.reaction_id)
        for rxn in entry.split(';')
    }
    num_reactions = len(reactions - {'-'})
    # One enzyme-reaction relation per ';'-separated ID on every enzyme record.
    enzyme_rxn_relations = sum(
        len(entry.split(';'))
        for entry in dsname.loc[dsname.reaction_id != '-', 'reaction_id']
    )

    # 3. EC numbers
    num_enzymes_have_ec = len(dsname[dsname.ec_number != '-'])
    num_proteins_ec_l1 = len(dsname[dsname.ec_specific_level == 1])
    num_proteins_ec_l2 = len(dsname[dsname.ec_specific_level == 2])
    num_proteins_ec_l3 = len(dsname[dsname.ec_specific_level == 3])
    num_proteins_ec_l4 = len(dsname[dsname.ec_specific_level == 4])

    valid_ecs = dsname.loc[dsname.ec_number != '-', 'ec_number']
    unique_ecs = sorted({ec for entry in valid_ecs for ec in entry.split(',')} - {'-'})

    # Specificity level of an EC number = 4 minus its number of '-' characters.
    ec_levels = [4 - ec.count('-') for ec in unique_ecs]

    num_ec = len(unique_ecs)
    num_ec_l1 = ec_levels.count(1)
    num_ec_l2 = ec_levels.count(2)
    num_ec_l3 = ec_levels.count(3)
    num_ec_l4 = ec_levels.count(4)

    # 4. Collect the results into a one-row DataFrame
    result = OrderedDict({
        'Dataset': name,
        'Protein_records': num_proteins,
        'None_enzyme_records': none_enzyme_records,
        'Enzyme_records': enzyme_records,
        'Reaction_records': num_reactions,
        'Enzyme-Reaction_relations': enzyme_rxn_relations,
        'Enzymes_with_ec': num_enzymes_have_ec,
        'Proteins_with_l1_ec': num_proteins_ec_l1,
        'Proteins_with_l2_ec': num_proteins_ec_l2,
        'Proteins_with_l3_ec': num_proteins_ec_l3,
        'Proteins_with_l4_ec': num_proteins_ec_l4,
        # NOTE(review): distinct proteins minus *records* with an EC number;
        # only meaningful when uniprot_id is unique per row - confirm.
        'Proteins_without_ec': num_proteins - num_enzymes_have_ec,
        'Distinct_ec': num_ec,
        'Distinct_ec_l1': num_ec_l1,
        'Distinct_ec_l2': num_ec_l2,
        'Distinct_ec_l3': num_ec_l3,
        'Distinct_ec_l4': num_ec_l4,
    })

    return pd.DataFrame([result])

### 3.1 Reading data

### 3.2 _ds_rcp_(24) 

In [57]:
# Summary statistics for the two full datasets, shown as one two-row table
stat_rcv, stat_rcp = (
    stat_ds(dsname=frame, name=label)
    for frame, label in ((ds_rcv, 'ds_rcv'), (ds_rcp, 'ds_rcp'))
)

pd.concat([stat_rcv, stat_rcp], ignore_index=True)

Unnamed: 0,Dataset,Protein_records,None_enzyme_records,Enzyme_records,Reaction_records,Enzyme-Reaction_relations,Enzymes_with_ec,Proteins_with_l1_ec,Proteins_with_l2_ec,Proteins_with_l3_ec,Proteins_with_l4_ec,Proteins_without_ec,Distinct_ec,Distinct_ec_l1,Distinct_ec_l2,Distinct_ec_l3,Distinct_ec_l4
0,ds_rcv,508587,282920,225667,10478,304013,216525,335,883,11991,203316,292062,4741,16,63,207,4453
1,ds_rcp,13515,10310,3205,2185,5795,5579,629,225,1780,2945,7936,1335,11,45,131,1146


In [19]:
# ===== Statistics =====
stats = []

# Train / test split of every cross-validation fold
for i in range(10):
    stats.extend([
        stat_ds(dfs_train[i], name=f"ds_rcv_fold{i+1}_train"),
        stat_ds(dfs_test[i], name=f"ds_rcv_fold{i+1}_test"),
    ])

# ds_rcp
stats.append(stat_ds(ds_rcp, name="ds_rcp"))

# Combine everything into a single table
df_stats = pd.concat(stats, ignore_index=True)

df_stats


Unnamed: 0,Dataset,Protein_records,None_enzyme_records,Enzyme_records,Reaction_records,Enzyme-Reaction_relations,Enzymes_with_ec,Proteins_with_l1_ec,Proteins_with_l2_ec,Proteins_with_l3_ec,Proteins_with_l4_ec,Proteins_without_ec,Distinct_ec,Distinct_ec_l1,Distinct_ec_l2,Distinct_ec_l3,Distinct_ec_l4
0,ds_rcv_fold1_train,457729,254621,203108,10208,273904,194882,300,817,10777,182988,262847,4591,16,63,201,4309
1,ds_rcv_fold1_test,50858,28299,22559,4398,30109,21643,35,66,1214,20328,29215,2124,9,27,115,1973
2,ds_rcv_fold2_train,457729,254584,203145,10186,273396,194903,301,781,10806,183015,262826,4603,16,64,200,4322
3,ds_rcv_fold2_test,50858,28336,22522,4690,30617,21622,34,102,1185,20301,29236,2168,6,28,108,2025
4,ds_rcv_fold3_train,457729,254843,202886,10205,273386,194640,291,803,10716,182830,263089,4595,15,64,199,4315
5,ds_rcv_fold3_test,50858,28077,22781,4544,30627,21885,44,80,1275,20486,28973,2190,10,26,118,2036
6,ds_rcv_fold4_train,457729,254616,203113,10200,273495,194868,302,809,10779,182978,262861,4603,16,63,202,4321
7,ds_rcv_fold4_test,50858,28304,22554,4559,30518,21657,33,74,1212,20338,29201,2126,10,26,106,1983
8,ds_rcv_fold5_train,457729,254703,203026,10233,273516,194788,307,787,10815,182879,262941,4592,16,63,202,4309
9,ds_rcv_fold5_test,50858,28217,22641,4560,30497,21737,28,96,1176,20437,29121,2138,6,25,118,1989


## 基于EC的反应分类

| Class | Enzyme Class | Reaction Catalyzed | Typical Reaction | Enzyme Example(s) with Trivial Name |
|-------|--------------|--------------------|------------------|-------------------------------------|
| **EC 1** | Oxidoreductases | Oxidation/reduction reactions; transfer of H and O atoms or electrons from one substance to another | AH + B → A + BH (reduced) <br> A + O → AO (oxidized) | Dehydrogenase, oxidase |
| **EC 2** | Transferases | Transfer of a functional group from one substance to another. The group may be methyl-, acyl-, amino-, or phosphate group | AB + C → A + BC | Transaminase, kinase |
| **EC 3** | Hydrolases | Formation of two products from a substrate by hydrolysis | AB + H2O → AOH + BH | Lipase, amylase, peptidase, phosphatase |
| **EC 4** | Lyases | Non-hydrolytic addition or removal of groups from substrates. C-C, C-N, C-O or C-S bonds may be cleaved | RCOCOOH → RCOH + CO2 <br> [X-A+B-Y] → [A=B + X-Y] | Decarboxylase |
| **EC 5** | Isomerases | Intramolecular rearrangement, i.e., isomerization changes within a single molecule | ABC → BCA | Isomerase, mutase |
| **EC 6** | Ligases | Join together two molecules by synthesis of new C-O, C-S, C-N or C-C bonds with simultaneous breakdown of ATP | X + Y + ATP → XY + ADP + Pi | Synthetase |
| **EC 7** | Translocases | Catalyze the movement of ions or molecules across membranes or their separation within membranes | | Transporter |


In [20]:
# First field of an EC number -> enzyme class name ('-' labels entries without an EC number)
rxn_class_dict = {
    '1': 'Oxidoreductases',
    '2': 'Transferases',
    '3': 'Hydrolases',
    '4': 'Lyases',
    '5': 'Isomerases',
    '6': 'Ligases',
    '7': 'Translocases',
    '-': 'None-Enzyme',
}

def get_rxn_class_distribution (ecdf):
    """Count EC annotations per top-level enzyme class.

    Each ``ec_number`` entry is split on ``','`` into one row per EC number;
    the text before the first ``'.'`` is looked up in ``rxn_class_dict``
    (unknown keys become the string ``'None'``).

    Args:
        ecdf: DataFrame with a string column ``ec_number``.

    Returns:
        ``value_counts().reset_index()`` of the derived ``rxn_class`` column.
    """
    per_ec = ecdf.assign(ec_number=ecdf['ec_number'].str.split(',')).explode('ec_number')
    first_field = per_ec['ec_number'].map(lambda ec: ec.split(".")[0])
    per_ec['rxn_class'] = first_field.map(lambda key: str(rxn_class_dict.get(key)))

    return per_ec.rxn_class.value_counts().reset_index()


def get_rxn_distribution (ecdf):
    """Count how many records reference each reaction ID.

    Each ``reaction_id`` entry is split on ``';'`` so that every ID is
    counted separately.

    Args:
        ecdf: DataFrame with a string column ``reaction_id``.

    Returns:
        ``value_counts().reset_index()`` of the individual reaction IDs.
    """
    rxn_ids = ecdf['reaction_id'].str.split(';').explode()
    return rxn_ids.value_counts().reset_index()


In [218]:
# Enzyme / non-enzyme record counts, computed from the loaded datasets rather
# than hard-coded (ds_rcv is the training set, ds_rcp the test set);
# reaction_id == '-' marks a record without a reaction, i.e. a non-enzyme.
train_enzyme_labels = ['None-enzyme', 'Enzyme']
train_enzyme_values = [
    int((ds_rcv.reaction_id == '-').sum()),
    int((ds_rcv.reaction_id != '-').sum()),
]

test_enzyme_labels = ['None-enzyme', 'Enzyme']
test_enzyme_values = [
    int((ds_rcp.reaction_id == '-').sum()),
    int((ds_rcp.reaction_id != '-').sum()),
]

# Enzyme-class distribution of the EC annotations in each dataset
rxn_class_counts_train = get_rxn_class_distribution(ds_rcv[['uniprot_id', 'ec_number']])
rxn_class_counts_test = get_rxn_class_distribution(ds_rcp[['uniprot_id', 'ec_number']])

# Drop the non-enzyme bucket so the class pies only show enzymes
rxn_class_counts_train = rxn_class_counts_train[rxn_class_counts_train.rxn_class!='None-Enzyme']
rxn_class_counts_test = rxn_class_counts_test[rxn_class_counts_test.rxn_class!='None-Enzyme']


# Create the pie traces (shared text / hover settings)
pie_style = dict(textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')
fig1_1_1 = go.Pie(labels=train_enzyme_labels, values=train_enzyme_values, **pie_style)
fig1_1_2 = go.Pie(labels=test_enzyme_labels, values=test_enzyme_values, **pie_style)
fig1_1_3 = go.Pie(labels=rxn_class_counts_train.rxn_class, values=rxn_class_counts_train['count'], **pie_style)
fig1_1_4 = go.Pie(labels=rxn_class_counts_test.rxn_class, values=rxn_class_counts_test['count'], **pie_style)


rxn_class_counts_test

Unnamed: 0,rxn_class,count
1,Transferases,1939
2,Oxidoreductases,1707
3,Hydrolases,1496
4,Lyases,679
5,Ligases,247
6,Isomerases,204
7,Translocases,100


In [219]:
# 创建子图
fig1 = make_subplots(
    rows=1, cols=4, 
    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['Enzyme and Non-enzyme Distribution - Train', 'Enzyme and Non-enzyme Distribution - Test', 'RXN Class Distribution - Train', 'RXN Class Distribution - Test']
)


# Add pie charts to each subplot
fig1.add_trace(fig1_1_1, row=1, col=1)
fig1.add_trace(fig1_1_2 , row=1, col=2)
fig1.add_trace(fig1_1_3, row=1, col=3)
fig1.add_trace(fig1_1_4, row=1, col=4)

# Update layout to control the size
fig1.update_layout(
    width=1950,  # total width of the figure
    height=560,  # total height of the figure
    margin=dict(t=20, b=10, l=0, r=40),  # margins around the figure
    font=dict(
        family='Times New Roman',  # 设置字体为 Times New Roman
        size=14  # 设置字体大小
    )
)

# Show the figure
fig1.show()



In [176]:
# Per-reaction record counts for the training (ds_rcv) and test (ds_rcp) datasets
rxn_counts_train = get_rxn_distribution (ds_rcv[['uniprot_id','reaction_id']])
rxn_counts_test = get_rxn_distribution (ds_rcp[['uniprot_id','reaction_id']])

# Only reactions above these record counts are drawn, and the '-' (no reaction)
# bucket is excluded, so the pies stay readable.
min_count_train_show = 500
min_count_test_show = 10
rxn_counts_train_show = rxn_counts_train[(rxn_counts_train['count']>min_count_train_show) &(rxn_counts_train.reaction_id!='-')]
rxn_counts_test_show = rxn_counts_test[(rxn_counts_test['count']>min_count_test_show) &(rxn_counts_test.reaction_id!='-')]

# Create the pie traces

fig2_1_1 = go.Pie(labels=rxn_counts_train_show.reaction_id, values=rxn_counts_train_show['count'], textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')
fig2_1_2 = go.Pie(labels=rxn_counts_test_show.reaction_id, values=rxn_counts_test_show['count'], textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')

In [177]:
# 创建子图
fig2 = make_subplots(
    rows=1, cols=2, 
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['RXN Distribution - Train', 'RXN Distribution - Test']
)


# Add pie charts to each subplot
fig2.add_trace(fig2_1_1, row=1, col=1)
fig2.add_trace(fig2_1_2 , row=1, col=2)


# Update layout to control the size
fig2.update_layout(
    width=1950,  # total width of the figure
    height=800,  # total height of the figure
    margin=dict(t=20, b=10, l=0, r=40),  # margins around the figure
    font=dict(
        family='Times New Roman',  # 设置字体为 Times New Roman
        size=14  # 设置字体大小
    )
)

# Show the figure
fig2.show()


In [180]:
# Rename the count column to 'count_test' so it is distinguishable from the
# training-set 'count' column of rxn_counts_train.
rxn_counts_test = rxn_counts_test.rename(columns={'count': 'count_test'})
rxn_counts_test

Unnamed: 0,reaction_id,count_test
0,-,10310
1,RHEA:17989,138
2,RHEA:46608,137
3,RHEA:13065,78
4,RHEA:21248,69
...,...,...
2181,RHEA:69667,1
2182,RHEA:69663,1
2183,RHEA:69655,1
2184,RHEA:69707,1


In [184]:
# Attach the training-set count of every test-set reaction (left join keeps
# all test rows). Selecting the two test columns first and renaming the
# training column before merging keeps this cell idempotent: re-running it
# does not create duplicate 'count_train' columns.
rxn_counts_test = rxn_counts_test[['reaction_id', 'count_test']].merge(
    rxn_counts_train.rename(columns={'count': 'count_train'}),
    on='reaction_id',
    how='left',
)
rxn_counts_test

Unnamed: 0,reaction_id,count_test,count_train
0,-,10310,282920
1,RHEA:17989,138,3510
2,RHEA:46608,137,3495
3,RHEA:13065,78,4222
4,RHEA:21248,69,4749
...,...,...,...
2181,RHEA:69667,1,3
2182,RHEA:69663,1,3
2183,RHEA:69655,1,3
2184,RHEA:69707,1,3


In [224]:
# Persist the per-reaction test/train count table.
# NOTE(review): absolute, user-specific path - consider deriving it from cfg so
# the notebook can run on other machines.
rxn_counts_test.to_feather('/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/rxn_stats.feather')

In [220]:
# Display the per-reaction counts produced by get_rxn_distribution for the training data
rxn_counts_train

Unnamed: 0,reaction_id,count
0,-,282920
1,RHEA:21248,4749
2,RHEA:13065,4222
3,RHEA:17989,3510
4,RHEA:46608,3495
...,...,...
10474,RHEA:26093,1
10475,RHEA:26097,1
10476,RHEA:25844,1
10477,RHEA:25848,1


In [3]:
def load_10folds_data(type='test'):
    """Load one split of all 10 cross-validation folds.

    Args:
        type: ``'test'`` reads ``valid.feather`` and ``'train'`` reads
            ``train.feather`` from each ``validation/fold<N>/`` directory
            under ``cfg.DIR_DATASET``.

    Returns:
        A list of 10 DataFrames (fold 1 to fold 10) with the columns
        ``uniprot_id``, ``seq`` and ``rxn_groundtruth`` (renamed from
        ``reaction_id``).

    Raises:
        ValueError: If ``type`` is neither ``'test'`` nor ``'train'``.
    """
    file_names = {'test': 'valid.feather', 'train': 'train.feather'}
    # Validate up front; otherwise an unknown value would leave the path list
    # undefined and fail later with an opaque UnboundLocalError.
    if type not in file_names:
        raise ValueError(f"type must be 'test' or 'train', got {type!r}")

    file_name = file_names[type]
    file_path = [f'{cfg.DIR_DATASET}validation/fold{fold_num+1}/{file_name}' for fold_num in range(10)]
    res = [pd.read_feather(path)[['uniprot_id','seq','reaction_id']].rename(columns={'reaction_id': 'rxn_groundtruth'}) for path in file_path]
    return res


# Load the train and validation split of every fold.
# NOTE(review): `data_trian` looks like a typo of `data_train`; the name is
# kept because other cells may reference it.
print('Loading 10-folds  data ...' )
data_trian = load_10folds_data(type='train')
data_test = load_10folds_data(type='test')

Loading 10-folds  data ...


In [11]:
# Take the first fold's validation split (first element of data_test) and preview two rows
axp = data_test[0]
axp.head(2)

Unnamed: 0,uniprot_id,seq,rxn_groundtruth
105768,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-
195319,C1AQW9,MRTPCSQHRRDRPSAIGSQLPDADTLDTRQPPLQEIPISSFADKTF...,RHEA:19669


In [12]:
# Rows whose ground-truth reaction is '-' (no reaction annotated)
axp[axp.rxn_groundtruth=='-']

Unnamed: 0,uniprot_id,seq,rxn_groundtruth
105768,Q9UYB6,MLPDRVLEILNEMKAERIRGATWLARKGAEAFLALAEELDEALLED...,-
135884,P64647,MALFSKILIFYVIGVNISFVIIWFISHEKTHIRLLSAFLVGITWPM...,-
53044,P53567,MSKISQQNSTPGVNGISVIHTQAHASGLQQVPQLVPAGPGGGGKAV...,-
403979,P53866,MAKRHSHYQGSRRRHARGSNSKKAGRGNAKGIQGRKIKKKPTPTNS...,-
169618,Q8Z495,MNMDIEARVKKVITSCIAVDVDSINGQTHLVEDLYADSLDLIDIVF...,-
...,...,...,...
401635,P40700,MGNDISLIALLAFSTLLPFIIASGTCFVKFSIVFVMVRNALGLQQI...,-
72895,B2A826,MLVLTRKQNESIMIGDDIEITVVGTEGDKVRLGIKAPKDVEIHRAE...,-
459889,Q9SCB9,MATRYWIAALPVADDNVAAGKTALWARLQEAISRHSFDTPLYRFTV...,-
146797,P38647,MISASRAAAARLVGTAASRSPAAARPQDGWNGLSHEAFRFVSRRDY...,-


In [14]:
# Fraction of records in axp whose ground-truth reaction is '-' (no reaction),
# computed from the data instead of numbers copied from earlier outputs.
(axp.rxn_groundtruth == '-').mean()

0.5564316331747218