In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Paths of the per-model tables and of their molecule-vector tables.
# Position 0 holds the iMM904 files, position 1 the iYO844 files.
models_list = [
    'models/iMM904_Model_ver2.csv',
    'models/iYO844_Model.csv',
]
mvecs_list = [
    'models/iMM904_mvec_ver2.csv',
    'models/iYO844_mvec.csv',
]

In [3]:
# Read the model table and the molecule-vector table of each model, following
# the order of the two path lists.
model_1, model_2 = (pd.read_csv(path) for path in models_list)
mvec_1, mvec_2 = (pd.read_csv(path) for path in mvecs_list)
# Molecule-vector table used below to build the molecule nodes.
mvec = pd.read_csv('models/mvec_withCompartment.csv')

In [4]:
# Display the table read from 'models/mvec_withCompartment.csv'.
mvec

Unnamed: 0,Mol,SMILES,0,1,2,3,4,5,6,7,...,118,119,120,121,122,123,124,125,126,127
0,10fthf_c,[H]C(=O)N(C[C@H]1CNc2[nH]c(N)nc(=O)c2N1)c1ccc(...,0.065057,-0.005264,-0.128438,0.077335,0.016711,-0.126336,-0.044556,-0.114117,...,0.022836,0.009202,0.021724,0.033271,0.036451,-0.042586,-0.004259,0.096110,0.083563,-0.038996
1,10fthf_m,[H]C(=O)N(C[C@H]1CNc2[nH]c(N)nc(=O)c2N1)c1ccc(...,0.060535,-0.005715,-0.136524,0.082420,0.027852,-0.125344,-0.041909,-0.117448,...,0.017983,0.016098,0.022470,0.037950,0.036086,-0.045248,-0.008820,0.088378,0.084593,-0.045721
2,12d3k5m_c,CSCCC(=O)/C(O)=C/O,0.058052,-0.014663,-0.127484,0.083216,0.028459,-0.115740,-0.033855,-0.111919,...,0.019250,0.015671,0.010380,0.022791,0.024664,-0.048111,0.004618,0.077801,0.065135,-0.031790
3,12dgr120_c,CCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCC,0.052606,-0.023302,-0.106628,0.061480,0.013646,-0.095978,-0.026610,-0.100317,...,0.019409,0.008847,0.017465,0.022450,0.016365,-0.043808,-0.001032,0.076966,0.063997,-0.034493
4,12dgr120_p,CCCCCCCCCCCC(=O)OCC(CO)OC(=O)CCCCCCCCCCC,0.048057,-0.021575,-0.099854,0.059156,0.018804,-0.086642,-0.023192,-0.089562,...,0.020678,0.009553,0.011394,0.015494,0.017015,-0.026423,0.004385,0.065725,0.055068,-0.035507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1735,xylu__D_c,OC[C@@H](O)[C@H](O)C(=O)CO,0.046301,-0.014360,-0.119205,0.076034,0.018383,-0.100588,-0.030845,-0.107983,...,0.018968,0.017931,0.014990,0.022433,0.036229,-0.038787,-0.005255,0.080818,0.076502,-0.043820
1736,xylu__L_c,C(O)[C@H](O)[C@@H](O)C(=O)CO,0.049939,-0.010038,-0.114556,0.071983,0.027549,-0.108875,-0.033477,-0.108700,...,0.013826,0.016927,0.011784,0.022522,0.027971,-0.048801,0.006316,0.073639,0.075610,-0.038666
1737,zn2_c,[Zn++],0.044306,-0.019651,-0.116815,0.071490,0.016455,-0.103330,-0.029461,-0.106962,...,0.008773,0.016569,0.016911,0.030801,0.034773,-0.037035,-0.007140,0.082875,0.076580,-0.032300
1738,zn2_e,[Zn++],0.052911,-0.015405,-0.114286,0.063784,0.018049,-0.110592,-0.038190,-0.093441,...,0.022706,0.009777,0.019027,0.026256,0.026743,-0.044916,0.004926,0.074004,0.066759,-0.035790


## Index the nodes

### Index enzyme

In [5]:
# Distinct enzyme identifiers per model: column 'Rule' of model_1 and column
# 'BSU' of model_2.
es_1 = model_1['Rule'].unique()
es_2 = model_2['BSU'].unique()
print(len(es_1), len(es_2))

# Stack both arrays, model_1 first. Values are not de-duplicated across models.
es_all = np.concatenate([es_1, es_2])
num_e = len(es_all)
print('Number of nodes of node_type_id 0: {}'.format(num_e))

430 391
Number of nodes of node_type_id 0: 821


In [6]:
# Enzyme node table: node_id is the position (0 .. num_e-1) of the name in es_all.
node_e = pd.DataFrame({'node_name': es_all, 'node_id': range(num_e)})
# Lookup frame keyed by enzyme name; set_index returns a new frame, so later
# changes to node_e do not show up here.
index_e = node_e.set_index('node_name')
node_e

Unnamed: 0,node_name,node_id
0,YAL012W,0
1,YAL022C,1
2,YAL054C,2
3,YAL060W,3
4,YAR015W,4
...,...,...
816,BSU40190,816
817,BSU40320,817
818,BSU40340,818
819,BSU40420,819


### Index molecule

In [7]:
# Peek at the first row of mvec with its first two entries dropped, i.e. the
# part of the row that is turned into a feature list below.
mvec.iloc[0, :][2:]

0      0.065057
1     -0.005264
2     -0.128438
3      0.077335
4      0.016711
         ...   
123   -0.042586
124   -0.004259
125     0.09611
126    0.083563
127   -0.038996
Name: 0, Length: 128, dtype: object

In [8]:
def conver_feature_to_list(x):
    """Return the entries of ``x`` labelled '0' through '127' as a plain list.

    ``x`` is sliced by label, so both end labels are included in the result.
    """
    features = x.loc['0':'127']
    return features.tolist()

In [9]:
num_m = mvec.shape[0]
print('Number of nodes of node_type_id 1: {}'.format(num_m))

# Build one feature list per molecule row without touching `mvec`. The earlier
# version assigned a 'feature' column onto `mvec.loc[:, :]`; whether that object
# is independent of `mvec` depends on the pandas version, so the loaded table
# could be modified as a side effect.
features = mvec.apply(conver_feature_to_list, axis=1)

# Molecule node ids continue right after the enzyme ids: num_e .. num_e+num_m-1.
node_m = pd.DataFrame({
    'node_name': mvec['Mol'],
    'node_id': list(range(num_e, num_e + num_m)),
    'node_feature': features,
})
# Lookup frame keyed by molecule name.
index_m = node_m.set_index('node_name')
node_m

Number of nodes of node_type_id 1: 1740


Unnamed: 0,node_name,node_id,node_feature
0,10fthf_c,821,"[0.06505691, -0.005263828, -0.12843789, 0.0773..."
1,10fthf_m,822,"[0.060535252, -0.005715403, -0.13652414, 0.082..."
2,12d3k5m_c,823,"[0.058052193, -0.014662637, -0.12748437, 0.083..."
3,12dgr120_c,824,"[0.052606136, -0.023302361, -0.106627904, 0.06..."
4,12dgr120_p,825,"[0.048056528, -0.021575488, -0.09985389, 0.059..."
...,...,...,...
1735,xylu__D_c,2556,"[0.04630084, -0.014359758, -0.119205356, 0.076..."
1736,xylu__L_c,2557,"[0.04993862, -0.010038005, -0.11455618, 0.0719..."
1737,zn2_c,2558,"[0.044305786, -0.019650882, -0.11681477, 0.071..."
1738,zn2_e,2559,"[0.05291073, -0.015405183, -0.11428592, 0.0637..."


In [10]:
# # ms_1 = mvec_1['meta)name'].unique()         # molecules 1
# # ms_2 = mvec_2['reac_meta_name'].unique()    # molecules 2
# ms_1 = model_1['reac_meta_name'].unique()     # molecules 1
# ms_2 = model_2['reac_meta_name'].unique()     # molecules 2
# print(ms_1.size, ms_2.size)

# ms_all = np.unique(np.concatenate((ms_1, ms_2), axis=0))
# num_m = ms_all.size
# print(num_m)

# index_m = pd.DataFrame({'mole_name': ms_all, 'index': list(range(num_e, num_e+num_m))}, columns=['mole_name', 'index'])
# index_m.set_index('mole_name', inplace=True)
# index_m

## Link: reaction

In [11]:
# Get reaction relations: the enzyme column, molecule-name column and value
# column of each model, brought under shared column names.
reac_1 = model_1.loc[:, ['Rule', 'reac_meta_name', 'reac_meta_value']]
reac_1.columns = ['enzyme_name', 'mole_name', 'reac_value']

reac_2 = model_2.loc[:, ['BSU', 'reac_meta_name', 'reac_meta_value']]
reac_2.columns = ['enzyme_name', 'mole_name', 'reac_value']

reac = pd.concat((reac_1, reac_2), axis=0)


# Log the molecules without mvec data.
# `isin` does the membership test in one pass instead of rebuilding a Python
# list of all node names for every row.
reac['in_mvec'] = reac['mole_name'].isin(node_m['node_name'])
no_mvec_data = reac[~reac['in_mvec']]
no_mvec_data.to_csv('no_mvec_data.csv')

print('Before dropping the molecure without mvec data, the number of relations = {}'.format(reac.shape))
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
reac = reac[reac['in_mvec']].copy()

# Get the index of the enzyme and molecule.
# Mapping through the lookup Series requires its index (the node names) to be
# unique; the former per-row `.loc[x]['node_id']` would silently return a
# Series instead of a scalar for a duplicated name.
reac['e_idx'] = reac['enzyme_name'].map(index_e['node_id'])
reac['m_idx'] = reac['mole_name'].map(index_m['node_id'])
# `map` yields NaN for a name that is missing from the lookup; keep the
# previous fail-fast behaviour (KeyError) instead of carrying NaN ids forward.
if reac[['e_idx', 'm_idx']].isna().any().any():
    raise KeyError('enzyme or molecule name without a node_id')
reac

Before dropping the molecure without mvec data, the number of relations = (3348, 4)


Unnamed: 0,enzyme_name,mole_name,reac_value,in_mvec,e_idx,m_idx
0,YAL012W,nh4_c,1.0,True,0,2078
1,YAL012W,cyst__L_c,-1.0,True,0,1459
2,YAL012W,cys__L_c,1.0,True,0,1454
3,YAL012W,2obut_c,1.0,True,0,985
4,YAL022C,cytd_e,-1.0,True,1,1462
...,...,...,...,...,...,...
1516,BSU40420,dcamp_c,1.0,True,819,1480
1517,BSU40420,imp_c,-1.0,True,819,1886
1518,BSU40850,accoa_c,-1.0,True,820,1172
1519,BSU40850,acmalt_c,1.0,True,820,1189


In [13]:
# `no_mvec_data` holds the relations whose molecule name is absent from node_m,
# so this counts the distinct molecule names WITHOUT mvec data (the label
# previously said "with").
print('Number of nodes without mvec info: {}'.format(len(set(no_mvec_data.mole_name))))

Number of nodes with mvec info: 178


In [14]:
# Get edge_type_id: 1 where reac_value > 0, otherwise 0 (this includes zero
# and missing values, for which the comparison is False).
is_type_1 = reac['reac_value'] > 0

# Vectorised construction of the edge table (replaces row-wise `apply` calls).
# `assign` works on a copy, so the incoming frame is not modified.
#   edge_type_id 1: source = enzyme id,   target = molecule id
#   edge_type_id 0: source = molecule id, target = enzyme id
link = (
    reac.assign(
        edge_type_id=is_type_1.astype('int64'),
        node_id_source=np.where(is_type_1, reac['e_idx'], reac['m_idx']),
        node_id_target=np.where(is_type_1, reac['m_idx'], reac['e_idx']),
        # Get edge_weight: magnitude of reac_value.
        edge_weight=reac['reac_value'].abs(),
    )
    # Reorganize the dataframe
    .loc[:, ['node_id_source', 'node_id_target', 'edge_type_id', 'edge_weight']]
    .sort_values(by=['edge_type_id', 'node_id_source'])
)
# As before, `reac` ends up bound to the same edge table as `link`. Because the
# name is rebound, this cell needs the previous cell re-run before it can be
# executed again.
reac = link
link

Unnamed: 0,node_id_source,node_id_target,edge_type_id,edge_weight
421,821,106,0,1.0
425,821,107,0,1.0
199,821,492,0,1.0
203,821,493,0,1.0
441,821,534,0,1.0
...,...,...,...,...
1510,817,2533,1,1.0
1512,818,1740,1,1.0
1513,818,1732,1,1.0
1516,819,1480,1,1.0


In [15]:
# Shape of the sub-table that holds only the edges with edge_type_id == 1.
link.loc[link['edge_type_id'] == 1].shape

(1552, 4)

## Node: enzyme and molecule

#### enzyme

In [16]:
# Display the enzyme node table before the node_type_id column is added.
node_e

Unnamed: 0,node_name,node_id
0,YAL012W,0
1,YAL022C,1
2,YAL054C,2
3,YAL060W,3
4,YAR015W,4
...,...,...
816,BSU40190,816
817,BSU40320,817
818,BSU40340,818
819,BSU40420,819


In [17]:
# Tag every enzyme node with node_type_id 0 and put node_id in the first column.
enzyme_type_ids = np.zeros(len(node_e), dtype=int)
node_e = node_e.assign(node_type_id=enzyme_type_ids)[['node_id', 'node_name', 'node_type_id']]
node_e

Unnamed: 0,node_id,node_name,node_type_id
0,0,YAL012W,0
1,1,YAL022C,0
2,2,YAL054C,0
3,3,YAL060W,0
4,4,YAR015W,0
...,...,...,...
816,816,BSU40190,0
817,817,BSU40320,0
818,818,BSU40340,0
819,819,BSU40420,0


#### molecule

In [18]:
# Tag every molecule node with node_type_id 1 and order the columns like the
# enzyme table, with node_feature last.
molecule_type_ids = np.ones(len(node_m), dtype=int)
node_m = node_m.assign(node_type_id=molecule_type_ids)[['node_id', 'node_name', 'node_type_id', 'node_feature']]
node_m

Unnamed: 0,node_id,node_name,node_type_id,node_feature
0,821,10fthf_c,1,"[0.06505691, -0.005263828, -0.12843789, 0.0773..."
1,822,10fthf_m,1,"[0.060535252, -0.005715403, -0.13652414, 0.082..."
2,823,12d3k5m_c,1,"[0.058052193, -0.014662637, -0.12748437, 0.083..."
3,824,12dgr120_c,1,"[0.052606136, -0.023302361, -0.106627904, 0.06..."
4,825,12dgr120_p,1,"[0.048056528, -0.021575488, -0.09985389, 0.059..."
...,...,...,...,...
1735,2556,xylu__D_c,1,"[0.04630084, -0.014359758, -0.119205356, 0.076..."
1736,2557,xylu__L_c,1,"[0.04993862, -0.010038005, -0.11455618, 0.0719..."
1737,2558,zn2_c,1,"[0.044305786, -0.019650882, -0.11681477, 0.071..."
1738,2559,zn2_e,1,"[0.05291073, -0.015405183, -0.11428592, 0.0637..."


In [19]:
# Stack the enzyme rows on top of the molecule rows. Columns present in only
# one of the two frames are filled with NaN for the rows of the other, and each
# frame keeps its own row labels (the index is not renumbered).
node = pd.concat([node_e, node_m])
node

Unnamed: 0,node_id,node_name,node_type_id,node_feature
0,0,YAL012W,0,
1,1,YAL022C,0,
2,2,YAL054C,0,
3,3,YAL060W,0,
4,4,YAR015W,0,
...,...,...,...,...
1735,2556,xylu__D_c,1,"[0.04630084, -0.014359758, -0.119205356, 0.076..."
1736,2557,xylu__L_c,1,"[0.04993862, -0.010038005, -0.11455618, 0.0719..."
1737,2558,zn2_c,1,"[0.044305786, -0.019650882, -0.11681477, 0.071..."
1738,2559,zn2_e,1,"[0.05291073, -0.015405183, -0.11428592, 0.0637..."


In [20]:
# def conver_feature_to_list(x):
#     return list(x['0':'127'])

# def reorganize_mvec(mvec):
#     m_col_name = mvec.columns.values.tolist()[0]
#     mvec_feature = mvec.set_index(m_col_name)
#     mvec_feature['feature'] = mvec_feature.apply(conver_feature_to_list, axis=1)
#     mvec_feature = mvec_feature.loc[:, ['feature']]
#     mvec_feature.index.names = ['mole_name']
#     return mvec_feature

# mvec_feature_1 = reorganize_mvec(mvec_1)
# mvec_feature_2 = reorganize_mvec(mvec_2)

# mvec_feature_1.reset_index(inplace=True)
# mvec_feature_1

# def in_2(x):
#     return x in mvec_feature_2.index

# mvec_feature_1['in_2'] = mvec_feature_1['mole_name'].apply(in_2)
# mvec_feature_1[mvec_feature_1['in_2'] == True]
# mvec_feature_1

# mvec_feature_2.reset_index(inplace=True)
# mvec_feature_2

# mvec_feature = pd.concat((mvec_feature_1, mvec_feature_2), axis=0)
# mvec_feature = mvec_feature.loc[:, ['mole_name', 'feature']]
# mvec_feature

# def add_node_id(x):
#     return index_m.loc[x]['index']

# mvec_feature['node_id'] = mvec_feature['mole_name'].apply(add_node_id)
# mvec_feature['node_type'] = np.ones(mvec_feature.shape[0], dtype=int)
# mol_nodes = mvec_feature.loc[:, ['node_id', 'mole_name', 'node_type', 'feature']]
# mol_nodes.columns = ['node_id', 'node_name', 'node_type_id', 'feature']
# mol_nodes

# node = pd.concat((enzyme_nodes, mol_nodes))
# node.sort_values('node_id', inplace=True)
# node.reset_index(inplace=True)
# node.drop('index', axis=1, inplace=True)
# node.to_csv('t.csv')
# node

## Label: km and kcat

In [21]:
# Collect the enzyme-name, Km and Kcat columns of both models under shared
# column names (model_2 spells them 'km' / 'kcat').
labels_1 = model_1.loc[:, ['Rule', 'Km', 'Kcat']]
labels_1.columns = ['enzyme_name', 'Km', 'Kcat']
labels_2 = model_2.loc[:, ['BSU', 'km', 'kcat']]
labels_2.columns = ['enzyme_name', 'Km', 'Kcat']
label = pd.concat((labels_1, labels_2), axis=0)

def check_nan(x):
    """Return True when at least one of ``x['Km']`` and ``x['Kcat']`` is not null.

    Note the name is misleading: True means the row HAS a label value and
    should be kept, not that it is NaN.
    """
    return (not pd.isnull(x['Km'])) or (not pd.isnull(x['Kcat']))

# Keep the rows with at least one label value. This is the vectorised
# equivalent of applying check_nan row by row; check_nan stays defined in case
# other cells call it. .copy() makes the result independent, so the column
# assignment below does not write into a slice of the unfiltered frame.
has_label = label[['Km', 'Kcat']].notna().any(axis=1)
label = label[has_label].copy()

# Look up the enzyme node id by name. Mapping through the lookup Series
# requires unique enzyme names in index_e; the former per-row
# `.loc[x]['node_id']` would silently return a Series for a duplicated name.
label['node_id'] = label['enzyme_name'].map(index_e['node_id'])
# `map` yields NaN for an unknown name; keep the previous fail-fast behaviour.
if label['node_id'].isna().any():
    raise KeyError('enzyme name without a node_id')
label

Unnamed: 0,enzyme_name,Km,Kcat,node_id
1,YAL012W,0.45000,1.41,0
2,YAL012W,0.27500,0.15,0
10,YAL054C,0.13650,,2
11,YAL054C,0.13650,,2
12,YAL054C,1.20000,,2
...,...,...,...,...
1494,BSU39760,0.12000,,812
1495,BSU39760,0.06000,,812
1497,BSU39760,0.06000,1.10,812
1501,BSU39980,0.00428,,813
