Micromet Network
# Reactome Data Integration
Goal: Match Reactome and VHM metabolites

## Preprocessing

In [1]:
import pandas as pd

In [2]:
reactome_mets = pd.read_csv('extracted_data/reactome/reactome_species_with_hmdb_ids.csv', dtype=str)
vmh_mets = pd.read_csv('extracted_data/vmh/vmh_metabolites_with_hmdb_ids.csv', dtype=str)

In [3]:
reactome_mets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238518 entries, 0 to 238517
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   species_id           238518 non-null  object
 1   species_name         238515 non-null  object
 2   species_reactome_id  238518 non-null  object
 3   entity_type          238518 non-null  object
 4   CHEBI                8370 non-null    object
 5   uniprot              166323 non-null  object
 6   ensembl              1310 non-null    object
 7   GRAC                 579 non-null     object
 8   pubchem              1003 non-null    object
 9   ncbi                 28 non-null      object
 10  mirbase              52 non-null      object
 11  ENA                  42 non-null      object
 12  KEGG                 834 non-null     object
 13  hmdb_id              991 non-null     object
 14  name                 991 non-null     object
 15  iupac                985 non-null 

In [4]:
vmh_mets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5321 entries, 0 to 5320
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   met_id                5321 non-null   object
 1   abbreviation          5321 non-null   object
 2   fullName              5321 non-null   object
 3   keggId                2324 non-null   object
 4   pubChemId             1996 non-null   object
 5   unified_abbreviation  5321 non-null   object
 6   CHEBI                 2063 non-null   object
 7   hmdb_id               1601 non-null   object
 8   name                  1601 non-null   object
 9   iupac                 1594 non-null   object
 10  foodb_id              1350 non-null   object
 11  chemspider_id         1270 non-null   object
 12  drugbank_id           475 non-null    object
 13  pdb_id                58 non-null     object
 14  wikipedia_id          799 non-null    object
 15  bigg_id               601 non-null    

In [5]:
reactome_mets.head()

Unnamed: 0,species_id,species_name,species_reactome_id,entity_type,CHEBI,uniprot,ensembl,GRAC,pubchem,ncbi,...,hmdb_id,name,iupac,foodb_id,chemspider_id,drugbank_id,pdb_id,wikipedia_id,bigg_id,vmh_id
0,species_10637177,"PEX2:PEX10:PEX12:PEX5S,L:Ub:UBE2D1,2,3:PEX13:P...",R-CEL-8953942,Reactome Complex,,,,,,,...,,,,,,,,,,
1,species_10603252,"UBE2D1,2,3",R-CEL-1234120,Reactome DefinedSet,,P35129,,,,,...,,,,,,,,,,
2,species_10603252,"UBE2D1,2,3",R-CEL-1234120,Reactome DefinedSet,,Q9U1U4,,,,,...,,,,,,,,,,
3,species_10603252,"UBE2D1,2,3",R-CEL-1234120,Reactome DefinedSet,,Q20617,,,,,...,,,,,,,,,,
4,species_10635551,RNF152:RRAGA:GDP:Ub:UBE2N,R-CEL-8938812,Reactome Complex,,,,,,,...,,,,,,,,,,


In [6]:
vmh_mets.head()

Unnamed: 0,met_id,abbreviation,fullName,keggId,pubChemId,unified_abbreviation,CHEBI,hmdb_id,name,iupac,foodb_id,chemspider_id,drugbank_id,pdb_id,wikipedia_id,bigg_id,vmh_id
0,1369,xtp,Xanthosine 5-triphosphate,C00700,439296.0,xtp,10049.0,HMDB0000293,Xanthosine 5-triphosphate,"({[({[(2R,3S,4R,5R)-5-(2,6-dihydroxy-9H-purin-...",FDB021932,388429.0,,,,35735.0,XTP
1,2659,docohxeth,Docosahexaenoyl Ethanolamide,,5283451.0,docohxeth,1006090.0,,,,,,,,,,
2,1759,srb_L,L-Sorbose,C00247,441484.0,srb_L,10295.0,HMDB0001266,L-Sorbose,"(2R,3S,4R,5S)-2-(hydroxymethyl)oxane-2,3,4,5-t...",FDB001126,390208.0,,SOE,Sorbose,,SRB_L
3,5322,M00538,1-Naphthol,C11714,7005.0,M00538,10319.0,HMDB0012138,1-Naphthol,naphthalen-1-ol,FDB005841,6739.0,,1NP,1-Naphthol,,M00538
4,4699,M00341,Docosatrienoic acid,C16534,5312557.0,M00341,1038735.0,HMDB0002823,Docosatrienoic acid,"(13Z,16Z,19Z)-docosa-13,16,19-trienoic acid",FDB019146,4471982.0,,,,,M00341


In [7]:
# CHEBI identifier has .0 in vmh_mets as it was originally saved as float. This does not match the reactome_mets CHEBO identifier, which is why we need to split off the .0 part
vmh_mets.CHEBI = vmh_mets.CHEBI.apply(lambda x: x.split('.')[0] if pd.notna(x) else float('NaN'))
vmh_mets[vmh_mets.CHEBI.notna()]

Unnamed: 0,met_id,abbreviation,fullName,keggId,pubChemId,unified_abbreviation,CHEBI,hmdb_id,name,iupac,foodb_id,chemspider_id,drugbank_id,pdb_id,wikipedia_id,bigg_id,vmh_id
0,1369,xtp,Xanthosine 5-triphosphate,C00700,439296.0,xtp,10049,HMDB0000293,Xanthosine 5-triphosphate,"({[({[(2R,3S,4R,5R)-5-(2,6-dihydroxy-9H-purin-...",FDB021932,388429,,,,35735,XTP
1,2659,docohxeth,Docosahexaenoyl Ethanolamide,,5283451.0,docohxeth,1006090,,,,,,,,,,
2,1759,srb_L,L-Sorbose,C00247,441484.0,srb_L,10295,HMDB0001266,L-Sorbose,"(2R,3S,4R,5S)-2-(hydroxymethyl)oxane-2,3,4,5-t...",FDB001126,390208,,SOE,Sorbose,,SRB_L
3,5322,M00538,1-Naphthol,C11714,7005.0,M00538,10319,HMDB0012138,1-Naphthol,naphthalen-1-ol,FDB005841,6739,,1NP,1-Naphthol,,M00538
4,4699,M00341,Docosatrienoic acid,C16534,5312557.0,M00341,1038735,HMDB0002823,Docosatrienoic acid,"(13Z,16Z,19Z)-docosa-13,16,19-trienoic acid",FDB019146,4471982,,,,,M00341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4900,2914,tyrtyr,Tyrosyl-Tyrosine,,5057430,tyrtyr,60987,HMDB0029117,Tyrosyl-Tyrosine,2-[2-amino-3-(4-hydroxyphenyl)propanamido]-3-(...,,,,,,,
4903,3478,q8h2,Ubiquinol-8,,25074411,q8h2,61682,HMDB0001060,Ubiquinol 8,"2,3-dimethoxy-5-methyl-6-[(2E,6E,10E,14E,18E,2...",FDB022399,24603698,,,,,Q8H2
4909,5717,udca3s,Ursodeoxycholic acid 3-sulfate,,21252318,udca3s,89499,HMDB0002642,Ursodeoxycholic acid 3-sulfate,"(4R)-4-[(1S,2S,5S,9S,10R,11S,14R,15R)-9-hydrox...",FDB023036,13628379,,,,,
4923,2925,valval,Valyl-Valine,,,valval,176461,,,,,,,,,,


In [8]:
# Same for pubChemId
vmh_mets.pubChemId = vmh_mets.pubChemId.apply(lambda x: x.split('.')[0] if pd.notna(x) else float('NaN'))

### Split species Reactome into metabolites and catalysts
As the reactome species contain metabolites as well as catalysts/enzymes, we will separate them for the matching by checking the role of each species in the reactions table

In [9]:
reactions_reactome = pd.read_csv('extracted_data/reactome/reactome_reactions.csv')

In [10]:
reactions_reactome.reactome_link = reactions_reactome.reactome_link.apply(lambda x: x.split('reactome:')[1])

In [11]:
reactions_reactome.head()

Unnamed: 0,reaction_id,reaction_name,species,reactome_link,reactants,products,enzymes
0,reaction_10635562,RNF152 polyubiquitinates RRAGA,CEL,R-CEL-8938815,"species_10635551,species_10635549","species_10635559,species_10635542,species_1058...",species_10635551
1,reaction_10635564,RNF152 binds RRAGA:GDP and Ubiquitin:UBE2D3,CEL,R-CEL-8938829,"species_10635549,species_10635544,species_1063...",species_10635551,
2,reaction_10637189,"PEX2:PEX10:PEX12 monoubiquitinates PEX5S,L at ...",CEL,R-CEL-8953946,species_10637177,"species_10603252,species_10637186",species_10637177
3,reaction_10637179,"PEX2:PEX10:PEX12 binds PEX5S,L (in PEX5S:PEX13...",CEL,R-CEL-8953917,"species_10637175,species_10637171",species_10637177,
4,reaction_10659826,CaMKK autophosphorylates in the nucleus,DDI,R-DDI-442749,"species_10659818,species_29358","species_113582,species_10659823",species_10659818


In [12]:
# Create set for each component of the reaction: reactants, products and enzymes
reactome_reactants = set(','.join(reactions_reactome[reactions_reactome.reactants.notna()]['reactants'].tolist()).split(','))
reactome_products = set(','.join(reactions_reactome[reactions_reactome.products.notna()]['products'].tolist()).split(','))
reactome_enzymes = set(','.join(reactions_reactome[reactions_reactome.enzymes.notna()]['enzymes'].tolist()).split(','))

In [13]:
# All relevant metabolites should be contained within the reactants and products of the reaction
reactome_metabolites = reactome_reactants | reactome_products

In [14]:
# From the original metabolites data frame select only the metabolites we just extracted
df_reactome_metabolites = reactome_mets[reactome_mets.species_id.isin(reactome_metabolites)]

In [15]:
df_reactome_enzymes = reactome_mets[reactome_mets.species_id.isin(reactome_enzymes)]

In [16]:
print(f'Reactome Metabolites from Reactions: {len(df_reactome_metabolites.drop_duplicates())}')
print(f'Reactome Catalysts from Reactions: {len(df_reactome_enzymes.drop_duplicates())}')
print(f'Sum: {len(df_reactome_metabolites.drop_duplicates()) + len(df_reactome_enzymes.drop_duplicates())}')
intersection = reactome_metabolites.intersection(reactome_enzymes) 
print(f'Intersection between metabolites and enzymes: {len(intersection)}')
print(f'Reactome Metabolites & Catalysts from Reactions: {len(df_reactome_metabolites.drop_duplicates()) + len(df_reactome_enzymes.drop_duplicates()) - len(intersection)}')

Reactome Metabolites from Reactions: 190732
Reactome Catalysts from Reactions: 66984
Sum: 257716
Intersection between metabolites and enzymes: 14277
Reactome Metabolites & Catalysts from Reactions: 243439


## Match Reactome and VMH Metabolites Based on CHEBI, HMDB, KEGG, Pubchem and Name
&rarr; Continue with df_reactome_metabolites

In [17]:
# Cast all to lowercase letters to prevent matching mistakes due to different capitalization
vmh_mets["unified_abbreviation"] = vmh_mets.unified_abbreviation.str.lower()
vmh_mets["fullName"] = vmh_mets.fullName.str.lower()
df_reactome_metabolites["species_name"] = df_reactome_metabolites.species_name.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reactome_metabolites["species_name"] = df_reactome_metabolites.species_name.str.lower()


In [18]:
# Create dictionaries so we can match the other database identifiers to the unique VMH abbreviation if we find a match
vmh_mets_chebi = {}
vmh_mets_fullName = {}
vmh_mets_hmdb = {}
vmh_mets_kegg = {}
vmh_mets_pubchem = {}
vmh_mets_abb = set()

In [19]:
def extract_info_vmh_mets(row):
    abb = row.unified_abbreviation
    name = row.fullName
    cheb = row.CHEBI
    hmdb = row.hmdb_id
    kegg = row.keggId
    pubchem = row.pubChemId
    
    vmh_mets_fullName.update({name: abb})

    vmh_mets_abb.add(abb)
    
    if pd.notna(cheb):
        vmh_mets_chebi.update({cheb: abb})
        
    if pd.notna(hmdb):
        vmh_mets_hmdb.update({hmdb: abb})
        
    if pd.notna(kegg):
        vmh_mets_kegg.update({kegg: abb})
        
    if pd.notna(pubchem):
        vmh_mets_pubchem.update({pubchem: abb})

In [20]:
vmh_mets.apply(extract_info_vmh_mets, axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
5316    None
5317    None
5318    None
5319    None
5320    None
Length: 5321, dtype: object

In [21]:
# Create method to match all identifiers we can find in the reactome data frame to the identifiers in the dictionaries we extracted from VMH

def match_to_vmh_mets(row):
    if row.CHEBI in vmh_mets_chebi:
        return vmh_mets_chebi.get(row.CHEBI)
    elif row.KEGG in vmh_mets_kegg:
        return vmh_mets_kegg.get(row.KEGG)
    elif row.pubchem in vmh_mets_pubchem:
        return vmh_mets_pubchem.get(row.pubchem)
    elif row.species_name in vmh_mets_fullName:
         return vmh_mets_fullName.get(row.species_name)
    elif row.species_name in vmh_mets_abb:
        return row.species_name
    elif row.hmdb_id in vmh_mets_hmdb:
        return vmh_mets_hmdb.get(row.hmdb_id)
    elif (pd.notna(row.vmh_id) and row.vmh_id.lower()) in vmh_mets_abb:
        return row.vmh_id.lower()
    else:
        return float('nan')

In [22]:
df_reactome_metabolites['unified_abbreviation'] = df_reactome_metabolites.apply(match_to_vmh_mets, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reactome_metabolites['unified_abbreviation'] = df_reactome_metabolites.apply(match_to_vmh_mets, axis=1)


In [23]:
# Look at the new column and check how it matches with the column vmh_id that was already present in the reactome data frame
df_reactome_metabolites[df_reactome_metabolites.unified_abbreviation.notna()]

Unnamed: 0,species_id,species_name,species_reactome_id,entity_type,CHEBI,uniprot,ensembl,GRAC,pubchem,ncbi,...,name,iupac,foodb_id,chemspider_id,drugbank_id,pdb_id,wikipedia_id,bigg_id,vmh_id,unified_abbreviation
13,species_113582,adp,R-ALL-113582,Reactome SimpleEntity,456216,,,,6022,,...,ADP,"[({[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-3,...",FDB021817,5800,,,Adenosine_diphosphate,33496,ADP,adp
16,species_30389,camp,R-ALL-30389,Reactome SimpleEntity,17489,,,,,,...,,,,,,,,,,camp
17,species_76577,amp,R-ALL-76577,Reactome SimpleEntity,16027,,,,6083,,...,Adenosine monophosphate,"{[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-3,4-...",FDB030677,5858,DB00131,,Adenylic_acid,33534,AMP,amp
23,species_113592,atp,R-ALL-113592,Reactome SimpleEntity,30616,,,,5957,,...,Adenosine triphosphate,"({[({[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-...",FDB030683,5742,DB00171,,Adenosine_triphosphate,33477,ATP,atp
24,species_114520,"i(1,4,5)p3",R-ALL-114520,Reactome SimpleEntity,16595,,,,,,...,,,,,,,,,,mi145p
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238003,species_5653964,"rragc,rragd:gdp",R-HSA-5653964,Reactome DefinedSet,17552,,,,,,...,,,,,,,,,,gdp
238294,species_9742698,"rraga, rragb:gtp",R-MMU-5653946,Reactome DefinedSet,15996,,,,,,...,,,,,,,,,,gtp
238310,species_9742712,"rragc,rragd:gdp",R-MMU-5653964,Reactome DefinedSet,17552,,,,,,...,,,,,,,,,,gdp
238437,species_9792284,hemoglobin dimer,R-MMU-2168876,Reactome DefinedSet,17627,,,,,,...,,,,,,,,,,pheme


&rarr; As we can see we were able to find new matches between the reactome and VMH metabolites

In [24]:
df_reactome_metabolites[df_reactome_metabolites.vmh_id.str.lower() == df_reactome_metabolites.unified_abbreviation]

Unnamed: 0,species_id,species_name,species_reactome_id,entity_type,CHEBI,uniprot,ensembl,GRAC,pubchem,ncbi,...,name,iupac,foodb_id,chemspider_id,drugbank_id,pdb_id,wikipedia_id,bigg_id,vmh_id,unified_abbreviation
13,species_113582,adp,R-ALL-113582,Reactome SimpleEntity,456216,,,,6022,,...,ADP,"[({[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-3,...",FDB021817,5800,,,Adenosine_diphosphate,33496,ADP,adp
17,species_76577,amp,R-ALL-76577,Reactome SimpleEntity,16027,,,,6083,,...,Adenosine monophosphate,"{[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-3,4-...",FDB030677,5858,DB00131,,Adenylic_acid,33534,AMP,amp
23,species_113592,atp,R-ALL-113592,Reactome SimpleEntity,30616,,,,5957,,...,Adenosine triphosphate,"({[({[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-...",FDB030683,5742,DB00171,,Adenosine_triphosphate,33477,ATP,atp
25,species_74016,ca2+,R-ALL-74016,Reactome SimpleEntity,29108,,,,271,,...,Calcium,calcium(2+) ion,FDB003513,266,,,Calcium,33764,CA2,ca2
26,species_29358,atp,R-ALL-29358,Reactome SimpleEntity,30616,,,,5957,,...,Adenosine triphosphate,"({[({[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-...",FDB030683,5742,DB00171,,Adenosine_triphosphate,33477,ATP,atp
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228279,species_10560008,n,R-DME-8855840,Reactome EntityWithAccessionedSequence,17196,P07207,,,6267,,...,L-Asparagine,(2S)-2-amino-3-carbamoylpropanoic acid,FDB000787,6031,DB00174,,Asparagine,34055,ASN_L,asn_l
228281,species_10560014,n,R-DME-8855833,Reactome EntityWithAccessionedSequence,17196,P07207,,,6267,,...,L-Asparagine,(2S)-2-amino-3-carbamoylpropanoic acid,FDB000787,6031,DB00174,,Asparagine,34055,ASN_L,asn_l
228337,species_10533327,n,R-DME-2064204,Reactome EntityWithAccessionedSequence,17196,P07207,,,6267,,...,L-Asparagine,(2S)-2-amino-3-carbamoylpropanoic acid,FDB000787,6031,DB00174,,Asparagine,34055,ASN_L,asn_l
228403,species_10539805,n,R-DME-3791130,Reactome EntityWithAccessionedSequence,17196,P07207,,,6267,,...,L-Asparagine,(2S)-2-amino-3-carbamoylpropanoic acid,FDB000787,6031,DB00174,,Asparagine,34055,ASN_L,asn_l


In [25]:
df_reactome_metabolites[(df_reactome_metabolites.vmh_id.notna()) & (df_reactome_metabolites.vmh_id.str.lower() != df_reactome_metabolites.unified_abbreviation)]

Unnamed: 0,species_id,species_name,species_reactome_id,entity_type,CHEBI,uniprot,ensembl,GRAC,pubchem,ncbi,...,name,iupac,foodb_id,chemspider_id,drugbank_id,pdb_id,wikipedia_id,bigg_id,vmh_id,unified_abbreviation
361,species_158543,gsh,R-ALL-158543,Reactome SimpleEntity,16856,,,,124886,,...,Glutathione,(2S)-2-amino-4-{[(1R)-1-[(carboxymethyl)carbam...,FDB001498,111188,DB00143,,Glutathione,33669,F6P,gthrd
381,species_1247892,gsh,R-ALL-1247892,Reactome SimpleEntity,16856,,,,124886,,...,Glutathione,(2S)-2-amino-4-{[(1R)-1-[(carboxymethyl)carbam...,FDB001498,111188,DB00143,,Glutathione,33669,F6P,gthrd
382,species_29450,gsh,R-ALL-29450,Reactome SimpleEntity,16856,,,,124886,,...,Glutathione,(2S)-2-amino-4-{[(1R)-1-[(carboxymethyl)carbam...,FDB001498,111188,DB00143,,Glutathione,33669,F6P,gthrd
390,species_162756,udp-glcnac,R-ALL-162756,Reactome SimpleEntity,16264,,,,445675,,...,Uridine diphosphate-N-acetylglucosamine,"[({[(2R,3S,4R,5R)-5-(2,4-dioxo-1,2,3,4-tetrahy...",FDB021930,393240,DB03397,,,33638,UDPG,uacgam
674,species_5635947,ade,R-ALL-5635947,Reactome SimpleEntity,16708,,,,190,,...,Adenine,7H-purin-6-amine,FDB012266,185,DB00173,,Adenine,34039,7DHCHSTEROL,ade
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139958,species_2066770,dha,R-ALL-2066770,Reactome SimpleEntity,28125,,,,5881,,...,Dehydroepiandrosterone,"(1S,2R,5S,10R,11S,15S)-5-hydroxy-2,15-dimethyl...",FDB021808,8036443,DB01708,,Dehydroepiandrosterone,37131,56DURA,crvnc
147500,species_2162231,glcnac,R-ALL-2162231,Reactome SimpleEntity,17411,,,,439174,,...,N-Acetyl-D-glucosamine,"N-[(3R,4R,5S,6R)-2,4,5-trihydroxy-6-(hydroxyme...",FDB008032,388319,,,,34006,ORN,acgam
155616,species_2872443,na+,R-ALL-2872443,Reactome SimpleEntity,29101,,,,923,,...,Sodium,sodium(1+) ion,FDB003523,899,,,Sodium,37376,K,na1
169184,species_8939024,coa-sh,R-ALL-8939024,Reactome SimpleEntity,15346,,,,87642,,...,Coenzyme A,"{[(2R,3S,4R,5R)-5-(6-amino-9H-purin-9-yl)-4-hy...",FDB022614,,DB01992,,Coenzyme_A,,XOL24OH,coa


&rarr; There are 430 metabolites for which the new found abbreviations match the already known ones, and 138 metabolites for which the abbreviations differ. Will have to take a look at these metabolites manually to see which abbreviation is the correct one.

In [26]:
df_reactome_metabolites.count()

species_id              190732
species_name            190731
species_reactome_id     190732
entity_type             190732
CHEBI                     7338
uniprot                 127191
ensembl                   1257
GRAC                       513
pubchem                    968
ncbi                        28
mirbase                     46
ENA                         42
KEGG                       814
hmdb_id                    955
name                       955
iupac                      949
foodb_id                   802
chemspider_id              787
drugbank_id                496
pdb_id                      37
wikipedia_id               743
bigg_id                    455
vmh_id                     568
unified_abbreviation      4240
dtype: int64

&rarr; 4240 metabolites could be matched in total (see unified_abbreviations), which is a lot more than the 568 vmh_ids we had in the beginning