# iML1515

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-11-27 


## 1. Import packages

In [63]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../../')
sys.path.insert(1,'../methods/active-learning/')
from config import conf as cfg
from tools import uniprottool as uptool
from tools import  bioFunctionLib as bfl
from tqdm import tqdm
import re
from IPython.display import display_markdown
from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)
from tkinter import _flatten
import json
import pandas as pd


FIRST_TIME_RUN = False # For the initial run, please set this flag to True. This will allow the program to download data from UniProt and RHEA, which may take longer depending on your internet speed.

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. 下载bigg数据

In [2]:
### 4.2  对比IML1515 gene
# 下载模型文件
if FIRST_TIME_RUN:
    ! wget -c http://bigg.ucsd.edu/static/models/iML1515.json
    ! wget -c http://bigg.ucsd.edu/static/namespace/bigg_models_reactions.txt
    
    
with open('iML1515.json', 'r', encoding='utf-8') as f:
    iml1515 = json.load(f)
    
iml1515_gene_ids = [gene['id'] for gene in iml1515['genes']]
print(f'Gene ids in iML1515: {len(iml1515_gene_ids)}')

rxn_bigg = pd.read_csv('bigg_models_reactions.txt', sep='\t')
rxn_bigg.head(3)

Gene ids in iML1515: 1516


Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c
1,DM_aacald_c,Sink needed to allow aminoacetaldehyde to leav...,aacald_c <->,iECB_1328; iBWG_1329; iE2348C_1286; iECBD_1354...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AACALD; DM_aacald_c; R_DM_AACALD
2,DM_amob_c,Sink needed to allow S-Adenosyl-4-methylthio-2...,amob_c <->,iECNA114_1301; iECDH10B_1368; iECP_1309; iECIA...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AMOB; DM_amob_c; sink_amob


## 3. 提取RHHEA 映射关系

In [None]:
# 获取bigg数据库中rxn的rhea数据库链接
def get_bigg2rhea_link(link_str):
    if 'RHEA' not in str(link_str):
        return '-'
    else:
        rhea_ids = [f'RHEA:{item.rsplit("/", 1)[-1]}' for item in link_str.split('; ') if 'RHEA' in item]
        rhea_ids = [item.split('#')[0] if '#' in item else item for item in rhea_ids]
        rhea_ids = list(set(rhea_ids))
        return rhea_ids
    
    
get_bigg2rhea_link(link_str=rxn_bigg.database_links[0])

['RHEA:35072', 'RHEA:35073', 'RHEA:35071', 'RHEA:35074']

In [57]:
# 获取bigg数据库中rxn的rhea数据库链接
rxn_bigg['rhea_link'] = rxn_bigg.database_links.parallel_apply(lambda x: get_bigg2rhea_link(x))
cross_ref_bigg2rhea =rxn_bigg[rxn_bigg.rhea_link!='-'].reset_index(drop=True)
cross_ref_bigg2rhea = cross_ref_bigg2rhea[['bigg_id', 'rhea_link']]

## 4. 构建 rheaid2biggid字典, biggid2rheaid字典

In [None]:
# 构造 biggid2rheaid 字典
biggid2rheaid = cross_ref_bigg2rhea.set_index('bigg_id')['rhea_link'].to_dict()

# 构造 rheaid2biggid 字典
rheaid2biggid = {}
for bigg_id, rhea_ids in biggid2rheaid.items():
    for rhea_id in rhea_ids:
        rheaid2biggid.setdefault(rhea_id, []).append(bigg_id)

# 保存字典
os.makedirs(cfg.DIR_DICT, exist_ok=True)
with open(f'{cfg.DIR_DICT}dict_biggid2rheaid.json', 'w') as f:
    json.dump(biggid2rheaid, f, indent=4)
    
with open(f'{cfg.DIR_DICT}dict_rheaid2biggid.json', 'w') as f:
    json.dump(rheaid2biggid, f, indent=4)

In [56]:
rheaid2biggid

{'RHEA:35072': ['DM_4crsol_c', 'CREStex', 'EX_4crsol_e'],
 'RHEA:35073': ['DM_4crsol_c', 'CREStex', 'EX_4crsol_e'],
 'RHEA:35071': ['DM_4crsol_c', 'CREStex', 'EX_4crsol_e'],
 'RHEA:35074': ['DM_4crsol_c', 'CREStex', 'EX_4crsol_e'],
 'RHEA:34951': ['EX_3hpp_e', '3HPPtex'],
 'RHEA:34952': ['EX_3hpp_e', '3HPPtex'],
 'RHEA:34953': ['EX_3hpp_e', '3HPPtex'],
 'RHEA:34954': ['EX_3hpp_e', '3HPPtex'],
 'RHEA:35127': ['EX_LalaDglu_e', 'LALADGLUtex'],
 'RHEA:35128': ['EX_LalaDglu_e', 'LALADGLUtex'],
 'RHEA:35129': ['EX_LalaDglu_e', 'LALADGLUtex'],
 'RHEA:35130': ['EX_LalaDglu_e', 'LALADGLUtex'],
 'RHEA:35023': ['EX_arbt_e', 'ARBTtex'],
 'RHEA:35026': ['EX_arbt_e', 'ARBTtex'],
 'RHEA:35025': ['EX_arbt_e', 'ARBTtex'],
 'RHEA:35024': ['EX_arbt_e', 'ARBTtex'],
 'RHEA:28458': ['EX_btn_e', 'BTNtex', 'BTNtn', 'BTNTe'],
 'RHEA:28461': ['EX_btn_e', 'BTNtex', 'BTNtn', 'BTNTe'],
 'RHEA:28460': ['EX_btn_e', 'BTNtex', 'BTNtn', 'BTNTe'],
 'RHEA:28459': ['EX_btn_e', 'BTNtex', 'BTNtn', 'BTNTe'],
 'RHEA:29674': [

In [61]:
rxn_bigg[rxn_bigg.bigg_id.isin(['DM_4crsol_c', 'CREStex', 'EX_4crsol_e'])]

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,rhea_link
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c,"[RHEA:35072, RHEA:35073, RHEA:35071, RHEA:35074]"
2683,CREStex,P-cresol transport vis diffusion (extra-organi...,4crsol_p <-> 4crsol_e,iAF987,RHEA: http://identifiers.org/rhea/35071; RHEA:...,CREStex,"[RHEA:35072, RHEA:35073, RHEA:35071, RHEA:35074]"
16669,EX_4crsol_e,P-Cresol exchange,4crsol_e <->,iAF987,RHEA: http://identifiers.org/rhea/35071; RHEA:...,EX_4crsol_e,"[RHEA:35072, RHEA:35073, RHEA:35071, RHEA:35074]"
