# Datasets analysis 

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-05-30  



## 1. Import packages

In [1]:
import pandas as pd
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
from tkinter import _flatten
from tools import rheatool as rheatool
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)

from IPython.display import  HTML
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Load data

In [2]:
train = pd.read_feather(cfg.FILE_DS_TRAIN)
test = pd.read_feather(cfg.FILE_DS_TEST)
rhea = pd.read_feather(cfg.FILE_DS_RHEA_REACTIONS)
print('loading success')

loading success


## 3. Statistics 

#### 3.1 Trainning Set

In [3]:
def get_ec_level(ec):
    if ec=='0':
        return 0

def stat_ds(dsname):

    #proteins
    num_proteins = len(set(dsname.uniprot_id))
    print( f'Protein records: \t\t{num_proteins}')
    print( f'None-enzyme records: \t\t{len(dsname[dsname.reaction_id=="-"])}') 
    print( f'Enzyme records: \t\t{len(dsname[dsname.reaction_id!="-"])}\n') 
    
    
    #reactions
    reactions = set(_flatten([item.split(';') for item in set(dsname.reaction_id)]))
    num_reactions = len(reactions) - 1 #remove reaction count for '-'
    print( f'Reaction records: \t\t{num_reactions}\n')
    
    #protein reaction realation
    relations = dsname[(dsname.reaction_id!='-')].reaction_id.apply(lambda x:len(x.split(';'))).sum()
    
    print(f'Enzyme-Reaction realtions: \t{relations}\n')
        
    #ec
    num_enzymes_have_ec = len(dsname[dsname.ec_number!='-'])
    num_proteins_ec_l1 = len(dsname[dsname.ec_specific_level==1])
    num_proteins_ec_l2 = len(dsname[dsname.ec_specific_level==2])
    num_proteins_ec_l3 = len(dsname[dsname.ec_specific_level==3])
    num_proteins_ec_l4 = len(dsname[dsname.ec_specific_level==4])

    ecs = pd.DataFrame(list(set(_flatten([item.split(',') for item in set(dsname[train.ec_number!='-'].ec_number)]))), columns=['ec'])
    ecs = ecs[ecs.ec!='-'].reset_index(drop=True)
    ecs['ec_specific_level'] = ecs.ec.apply(lambda x: 4-x.count('-'))

    num_ec = len(ecs) 
    num_ec_l4 = len(ecs[ecs.ec_specific_level==4]) 
    num_ec_l3 = len(ecs[ecs.ec_specific_level==3]) 
    num_ec_l2 = len(ecs[ecs.ec_specific_level==2]) 
    num_ec_l1 = len(ecs[ecs.ec_specific_level==1]) 

    # reactions = list(set(_flatten([item.split(';') for item in set(dsname[dsname.ec_number!='-'].reaction_id)])))

    print( f'Enzymes  with ec: \t{num_enzymes_have_ec}')

    print( f'protein records with ec: \t{num_enzymes_have_ec}, (l1_ec={num_proteins_ec_l1}, l2_ec={num_proteins_ec_l2}, l3_ec={num_proteins_ec_l3}, l4_ec={num_proteins_ec_l4}) \n\
protein records without ec: \t{num_proteins-num_enzymes_have_ec} \n\n\
distinc ec numbers: \t\t{num_ec}, (l1ec={num_ec_l1},l2ec={num_ec_l2},l3ec={num_ec_l3},l4ec={num_ec_l4})\n\n\
reactions: \t\t\t{num_reactions} \n\
    ')
    
    
print('===================================  trainning set  ========================================')
stat_ds(dsname=train)

Protein records: 		508587
None-enzyme records: 		282920
Enzyme records: 		225667

Reaction records: 		10478

Enzyme-Reaction realtions: 	304013

Enzymes  with ec: 	216525
protein records with ec: 	216525, (l1_ec=335, l2_ec=883, l3_ec=11991, l4_ec=203316) 
protein records without ec: 	292062 

distinc ec numbers: 		4290, (l1ec=6,l2ec=22,l3ec=127,l4ec=4135)

reactions: 			10478 
    


### 3.2 Testing set

In [4]:
print('====================================  testing set  =========================================')
stat_ds(dsname=test)

Protein records: 		13515
None-enzyme records: 		10310
Enzyme records: 		3205

Reaction records: 		2185

Enzyme-Reaction realtions: 	5795

Enzymes  with ec: 	5579
protein records with ec: 	5579, (l1_ec=629, l2_ec=225, l3_ec=1780, l4_ec=2945) 
protein records without ec: 	7936 

distinc ec numbers: 		899, (l1ec=7,l2ec=17,l3ec=89,l4ec=786)

reactions: 			2185 
    


  ecs = pd.DataFrame(list(set(_flatten([item.split(',') for item in set(dsname[train.ec_number!='-'].ec_number)]))), columns=['ec'])


## 基于EC的反应分类

| Class | Reaction Catalyzed | Typical Reaction | Enzyme Example(s) with Trivial Name |
|-------|--------------------|------------------|-------------------------------------|
| **EC 1** | Oxidoreductases | Oxidation/reduction reactions; transfer of H and O atoms or electrons from one substance to another | AH + B → A + BH (reduced) <br> A + O → AO (oxidized) | Dehydrogenase, oxidase |
| **EC 2** | Transferases | Transfer of a functional group from one substance to another. The group may be methyl-, acyl-, amino-, or phosphate group | AB + C → A + BC | Transaminase, kinase |
| **EC 3** | Hydrolases | Formation of two products from a substrate by hydrolysis | AB + H2O → AOH + BH | Lipase, amylase, peptidase, phosphatase |
| **EC 4** | Lyases | Non-hydrolytic addition or removal of groups from substrates. C-C, C-N, C-O or C-S bonds may be cleaved | RCOCOOH → RCOH + CO2 <br> [X-A+B-Y] → [A=B + X-Y] | Decarboxylase |
| **EC 5** | Isomerases | Intramolecule rearrangement, i.e., isomerization changes within a single molecule | ABC → BCA | Isomerase, mutase |
| **EC 6** | Ligases | Join together two molecules by synthesis of new C-O, C-S, C-N or C-C bonds with simultaneous breakdown of ATP | X + Y + ATP → XY + ADP + Pi | Synthetase |
| **EC 7** | Translocases | Catalyze the movement of ions or molecules across membranes or their separation within membranes | | Transporter |


In [216]:
rxn_class_dict = {'1':'Oxidoreductases', '2':'Transferases', '3':'Hydrolases','4':'Lyases', '5':'Isomerases', '6':'Ligases', '7':'Translocases', '-':'None-Enzyme'}

def get_rxn_class_distribution (ecdf):
    
    ecdf = ecdf.assign(ec_number=ecdf['ec_number'].str.split(',')).explode('ec_number')
    ecdf['rxn_class']=ecdf.ec_number.apply(lambda x: f'{rxn_class_dict.get(x.split(".")[0])}')
    rxn_class_counts  = ecdf.rxn_class.value_counts().reset_index()
    
    return rxn_class_counts


def get_rxn_distribution (ecdf):
    
    ecdf = ecdf.assign(reaction_id=ecdf['reaction_id'].str.split(';')).explode('reaction_id')
    # ecdf['rxn_class']=ecdf.reaction_id.apply(lambda x: f'{x.split(".")[0]}')
    rxn_class_counts  = ecdf.reaction_id.value_counts().reset_index()
    
    return rxn_class_counts


In [218]:
train_enzyme_labels = ['None-enzyme', 'Enzyme']
train_enzyme_values = [282920, 225667]

test_enzyme_labels = ['None-enzyme', 'Enzyme']
test_enzyme_values = [10310, 3205]

rxn_class_counts_train = get_rxn_class_distribution(train[['uniprot_id', 'ec_number']])
rxn_class_counts_test = get_rxn_class_distribution(test[['uniprot_id', 'ec_number']])

rxn_class_counts_train = rxn_class_counts_train[rxn_class_counts_train.rxn_class!='None-Enzyme']
rxn_class_counts_test = rxn_class_counts_test[rxn_class_counts_test.rxn_class!='None-Enzyme']


# 创建饼图对象
fig1_1_1 = go.Pie(labels=train_enzyme_labels, values=train_enzyme_values, textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')
fig1_1_2 = go.Pie(labels=test_enzyme_labels, values=test_enzyme_values, textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')
fig1_1_3 = go.Pie(labels=rxn_class_counts_train.rxn_class, values=rxn_class_counts_train['count'], textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')
fig1_1_4 = go.Pie(labels=rxn_class_counts_test.rxn_class, values=rxn_class_counts_test['count'], textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')


rxn_class_counts_test

Unnamed: 0,rxn_class,count
1,Transferases,1939
2,Oxidoreductases,1707
3,Hydrolases,1496
4,Lyases,679
5,Ligases,247
6,Isomerases,204
7,Translocases,100


In [219]:
# 创建子图
fig1 = make_subplots(
    rows=1, cols=4, 
    specs=[[{'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['Enzyme and Non-enzyme Distribution - Train', 'Enzyme and Non-enzyme Distribution - Test', 'RXN Class Distribution - Train', 'RXN Class Distribution - Test']
)


# Add pie charts to each subplot
fig1.add_trace(fig1_1_1, row=1, col=1)
fig1.add_trace(fig1_1_2 , row=1, col=2)
fig1.add_trace(fig1_1_3, row=1, col=3)
fig1.add_trace(fig1_1_4, row=1, col=4)

# Update layout to control the size
fig1.update_layout(
    width=1950,  # total width of the figure
    height=560,  # total height of the figure
    margin=dict(t=20, b=10, l=0, r=40),  # margins around the figure
    font=dict(
        family='Times New Roman',  # 设置字体为 Times New Roman
        size=14  # 设置字体大小
    )
)

# Show the figure
fig1.show()



In [176]:
rxn_counts_train = get_rxn_distribution (train[['uniprot_id','reaction_id']])
rxn_counts_test = get_rxn_distribution (test[['uniprot_id','reaction_id']])

rxn_counts_train_show = rxn_counts_train[(rxn_counts_train['count']>500) &(rxn_counts_train.reaction_id!='-')]
rxn_counts_test_show = rxn_counts_test[(rxn_counts_test['count']>10) &(rxn_counts_test.reaction_id!='-')]

# 创建饼图对象

fig2_1_1 = go.Pie(labels=rxn_counts_train_show.reaction_id, values=rxn_counts_train_show['count'], textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')
fig2_1_2 = go.Pie(labels=rxn_counts_test_show.reaction_id, values=rxn_counts_test_show['count'], textinfo='percent+label', textposition='inside', hoverinfo='label+percent+value')

In [177]:
# 创建子图
fig2 = make_subplots(
    rows=1, cols=2, 
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['RXN Distribution - Train', 'RXN Distribution - Test']
)


# Add pie charts to each subplot
fig2.add_trace(fig2_1_1, row=1, col=1)
fig2.add_trace(fig2_1_2 , row=1, col=2)


# Update layout to control the size
fig2.update_layout(
    width=1950,  # total width of the figure
    height=800,  # total height of the figure
    margin=dict(t=20, b=10, l=0, r=40),  # margins around the figure
    font=dict(
        family='Times New Roman',  # 设置字体为 Times New Roman
        size=14  # 设置字体大小
    )
)

# Show the figure
fig2.show()


In [180]:
rxn_counts_test = rxn_counts_test.rename(columns={'count': 'count_test'})
rxn_counts_test

Unnamed: 0,reaction_id,count_test
0,-,10310
1,RHEA:17989,138
2,RHEA:46608,137
3,RHEA:13065,78
4,RHEA:21248,69
...,...,...
2181,RHEA:69667,1
2182,RHEA:69663,1
2183,RHEA:69655,1
2184,RHEA:69707,1


In [184]:
rxn_counts_test=rxn_counts_test.merge(rxn_counts_train, on='reaction_id', how='left').rename(columns={'count': 'count_train'})
rxn_counts_test

Unnamed: 0,reaction_id,count_test,count_train
0,-,10310,282920
1,RHEA:17989,138,3510
2,RHEA:46608,137,3495
3,RHEA:13065,78,4222
4,RHEA:21248,69,4749
...,...,...,...
2181,RHEA:69667,1,3
2182,RHEA:69663,1,3
2183,RHEA:69655,1,3
2184,RHEA:69707,1,3


In [224]:
rxn_counts_test.to_feather('/hpcfs/fhome/shizhenkun/codebase/preaction/results240614/rxn_stats.feather')

In [220]:
rxn_counts_train

Unnamed: 0,reaction_id,count
0,-,282920
1,RHEA:21248,4749
2,RHEA:13065,4222
3,RHEA:17989,3510
4,RHEA:46608,3495
...,...,...
10474,RHEA:26093,1
10475,RHEA:26097,1
10476,RHEA:25844,1
10477,RHEA:25848,1


In [223]:
rxn_counts_test[rxn_counts_test.reaction_id=='RHEA:14721']

Unnamed: 0,reaction_id,count_test,count_train
