In [9]:
import pandas as pd 
import re
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# link to csv file to import to df
openTECR_csv = '/home/jackmcgoldrick/openTECR/data/TECRDB.csv'

In [3]:
# converting to pandas df
data_openTECR = pd.read_csv(openTECR_csv)

In [4]:
data_openTECR.head()

Unnamed: 0,id,url,reference,method,eval,EC,enzyme_name,reaction,description,K,K_prime,temperature,ionic_strength,p_h,p_mg
0,https://w3id.org/related-to/doi.org/10.5281/ze...,http://xpdb.nist.gov/enzyme_thermodynamics/enz...,07LIN/ALG,spectrophotometry,A,1.1.1.87,homoisocitrate dehydrogenase,kegg:C05662 + kegg:C00003 = kegg:C00322 + kegg...,"(1R,2S)-1-hydroxybutane-1,2,4-tricarboxylate(a...",,0.45,298.15,,7.5,
1,https://w3id.org/related-to/doi.org/10.5281/ze...,http://xpdb.nist.gov/enzyme_thermodynamics/enz...,63GRE,spectrophotometry,C,3.5.4.9,methenyltetrahydrofolate cyclohydrolase,kegg:C00445 + kegg:C00001 = kegg:C00234,"5,10-methenyltetrahydrofolate(aq) + H2O(l) = 1...",,4.2,298.15,,6.5,
2,https://w3id.org/related-to/doi.org/10.5281/ze...,http://xpdb.nist.gov/enzyme_thermodynamics/enz...,67ENG/DEN,spectrophotometry,B,4.2.1.3,aconitate hydratase,kegg:C00311 = kegg:C00158,isocitrate(aq) = citrate(aq),,18.0,310.15,,7.3,2.96
3,https://w3id.org/related-to/doi.org/10.5281/ze...,http://xpdb.nist.gov/enzyme_thermodynamics/enz...,67ENG/DEN,spectrophotometry,B,4.2.1.3,aconitate hydratase,kegg:C00311 = kegg:C00158,isocitrate(aq) = citrate(aq),,25.0,310.15,,7.3,2.8
4,https://w3id.org/related-to/doi.org/10.5281/ze...,http://xpdb.nist.gov/enzyme_thermodynamics/enz...,67ENG/DEN,spectrophotometry,B,4.2.1.3,aconitate hydratase,kegg:C00311 = kegg:C00158,isocitrate(aq) = citrate(aq),,33.0,310.15,,7.3,2.3


In [5]:
# kegg identifiers stored in 'reaction' column - retrieve into df series or py list
keggs_series = data_openTECR['reaction']

In [6]:
keggs_series

0       kegg:C05662 + kegg:C00003 = kegg:C00322 + kegg...
1                 kegg:C00445 + kegg:C00001 = kegg:C00234
2                               kegg:C00311 = kegg:C00158
3                               kegg:C00311 = kegg:C00158
4                               kegg:C00311 = kegg:C00158
                              ...                        
4539    kegg:C03373 + kegg:C00288 = kegg:C04751 + kegg...
4540    kegg:C00234 + kegg:C04677 = kegg:C00101 + kegg...
4541              kegg:C00101 + kegg:C00067 = kegg:C00143
4542              kegg:C00199 + kegg:C00067 = kegg:C06019
4543                            kegg:C06019 = kegg:C00085
Name: reaction, Length: 4544, dtype: object

In [7]:
# Extract KEGG identifiers from the reaction column
def extract_kegg_identifiers(series):
    all_identifiers = []
    for reaction in series:
        # Use regex to find all occurrences of "kegg:<identifier>"
        matches = re.findall(r'kegg:[A-Za-z0-9]+', reaction)
        all_identifiers.extend(matches)
    
    # Remove duplicates by converting to a set, then back to a list
    unique_identifiers = list(set(all_identifiers))
    return unique_identifiers

In [8]:
# Apply function to the 'reaction' column
keggs_list = extract_kegg_identifiers(data_openTECR['reaction'])

# Display the result
print(keggs_list)


['kegg:C04752', 'kegg:C00407', 'kegg:C01545', 'kegg:C00417', 'kegg:C00197', 'kegg:C00492', 'kegg:C00357', 'kegg:C14177', 'kegg:C04442', 'kegg:C04741', 'kegg:C00469', 'kegg:C00013', 'kegg:C03149', 'kegg:C01653', 'kegg:C03167', 'kegg:C00942', 'kegg:C02052', 'kegg:C00979', 'kegg:C00318', 'kegg:C00160', 'kegg:C01277', 'kegg:C00189', 'kegg:C00123', 'kegg:C00294', 'kegg:C03017', 'kegg:C02571', 'kegg:C00300', 'kegg:C00870', 'kegg:C00499', 'kegg:C01551', 'kegg:C00109', 'kegg:C01142', 'kegg:C02930', 'kegg:C00787', 'kegg:C00311', 'kegg:C03392', 'kegg:C01157', 'kegg:C01217', 'kegg:C05984', 'kegg:C03210', 'kegg:C00186', 'kegg:C03394', 'kegg:C01112', 'kegg:C03564', 'kegg:C00363', 'kegg:C01636', 'kegg:C04618', 'kegg:C00256', 'kegg:C03044', 'kegg:C00114', 'kegg:C00624', 'kegg:C02988', 'kegg:C01100', 'kegg:C16074', 'kegg:C00126', 'kegg:C08492', 'kegg:C00252', 'kegg:C05439', 'kegg:C02780', 'kegg:C00207', 'kegg:C03599', 'kegg:C00394', 'kegg:C00905', 'kegg:C03875', 'kegg:C01089', 'kegg:C02048', 'kegg:C03

In [33]:
# Remove the 'kegg:' prefix
kegg_ids = [id.split(":")[1] for id in keggs_list]

print(len(kegg_ids))

603


## Linking to ReconXKG SPARQL endpoint to facilitate Molecule cross-mapping

ReconXKG is a knowledge graph containing info across multiple sites including VMH, MetaNetX, SwissLipids, Reactome and many more. This allows for metabolite entries across various platforms to be mapped accordingly. This resource will be leverage here in the form of a SPARQL query which will aim to return all Identifiers associated with an input Kegg Identifier.

In [36]:
def kegg_graphdb_single_row(kegg_id):
    # Regex patterns for each identifier type
    identifier_patterns = {
        "CHEBI": r"CHEBI",
        "BiGG Metabolite": r"bigg\.metabolite:",
        "HMDB": r"hmdb:",
        "KEGG Compound": r"kegg\.compound:",
        "MetaCyc": r"metacyc\.",
        "SABIORK": r"sabiork\.",
        "SEED": r"seed\.",
        "MNX": r"rdf\.metanetx\.org",
        "VMH": r"vmhmetabolite:",
        "Reactome": r"reactome:",
        "SwissLipids": r"SLM:",
        "EnviPath": r"envipath\."
    }

    # SPARQL query
    query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX mnx: <https://rdf.metanetx.org/schema/>
    PREFIX keggC: <https://www.genome.jp/kegg/>

    SELECT DISTINCT ?keggID ?relatedIDs
    WHERE {{
      ?mnxChemID mnx:chemXref ?rawKeggUri .
      BIND(?mnxChemID AS ?tempChemID)
      FILTER(REGEX(STR(?rawKeggUri), "kegg", "i"))
      BIND(?rawKeggUri AS ?keggID)
      {{
        ?tempChemID mnx:chemXref ?relatedIDs .
      }}
      UNION
      {{
        BIND(?tempChemID AS ?relatedIDs)
      }}
      ?keggID rdfs:label ?keggLabel .
      FILTER(?relatedIDs != ?keggID)
      FILTER(REGEX(?keggLabel, "^keggC:{kegg_id}$", "i"))
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()

    # Initialize a single row for the given KEGG ID
    single_row = {
        "KEGG_ID": kegg_id,
        **{key: None for key in identifier_patterns.keys()},  # Initialize columns for each pattern
        "Extra Identifiers": []  # To store any unmatched identifiers
    }

    # Process results
    for entry in results["results"]["bindings"]:
        related_id = entry.get("relatedIDs", {}).get("value", "")
        matched = False

        # Match related_id against identifier patterns
        for identifier, pattern in identifier_patterns.items():
            if re.search(pattern, related_id, re.IGNORECASE):
                # If the column is empty, populate it; else append (optional depending on use case)
                if single_row[identifier] is None:
                    single_row[identifier] = related_id
                else:
                    # Optional: Store multiple matches for the same pattern
                    single_row[identifier] += f"; {related_id}"
                matched = True
                break

        # If no match, add to Extra Identifiers
        if not matched:
            single_row["Extra Identifiers"].append(related_id)

    # Convert "Extra Identifiers" to a string for storage
    single_row["Extra Identifiers"] = "; ".join(single_row["Extra Identifiers"])

    return single_row


### Test 1

In [37]:
result = kegg_graphdb_single_row("C04752")
print(result)


{'KEGG_ID': 'C04752', 'CHEBI': 'https://identifiers.org/CHEBI:16629; https://identifiers.org/CHEBI:57841; http://purl.obolibrary.org/obo/CHEBI_11612; http://purl.obolibrary.org/obo/CHEBI_1194; http://purl.obolibrary.org/obo/CHEBI_11953; http://purl.obolibrary.org/obo/CHEBI_16629; http://purl.obolibrary.org/obo/CHEBI_19684; http://purl.obolibrary.org/obo/CHEBI_20308; http://purl.obolibrary.org/obo/CHEBI_57841', 'BiGG Metabolite': 'https://identifiers.org/bigg.metabolite:2mahmp', 'HMDB': 'https://identifiers.org/hmdb:HMDB0304168', 'KEGG Compound': 'https://identifiers.org/kegg.compound:C04752', 'MetaCyc': 'https://identifiers.org/metacyc.compound:AMINO-HYDROXYMETHYL-METHYLPYRIMIDINE-PP', 'SABIORK': 'https://identifiers.org/sabiork.compound:1832; https://sabiork.h-its.org/newSearch?q=1832', 'SEED': 'https://identifiers.org/seed.compound:cpd02894; https://modelseed.org/biochem/compounds/M_cpd02894; https://modelseed.org/biochem/compounds/cpd02894', 'MNX': 'https://rdf.metanetx.org/chem/MNX

### Test 2

In [40]:
test_keggs = ['C04752',
 'C00407',
 'C01545',
 'C00417',
 'C00197',
 'C00492',
 'C00357',
 'C14177',
 'C04442',
 'C04741',
 'C00469',]

In [41]:
all_results = [kegg_graphdb_single_row(kegg_id) for kegg_id in test_keggs]

In [42]:
test_keggs = pd.DataFrame(all_results)
print(test_keggs)


   KEGG_ID                                              CHEBI  \
0   C04752  https://identifiers.org/CHEBI:16629; https://i...   
1   C00407  https://identifiers.org/CHEBI:17191; https://i...   
2   C01545  https://identifiers.org/CHEBI:17935; http://pu...   
3   C00417  https://identifiers.org/CHEBI:16383; https://i...   
4   C00197  https://identifiers.org/CHEBI:17050; https://i...   
5   C00492  https://identifiers.org/CHEBI:16634; https://i...   
6   C00357  https://identifiers.org/CHEBI:15784; https://i...   
7   C14177  https://identifiers.org/CHEBI:16585; https://i...   
8   C04442  https://identifiers.org/CHEBI:15925; https://i...   
9   C04741  https://identifiers.org/CHEBI:15544; https://i...   
10  C00469  https://identifiers.org/CHEBI:16236; https://i...   

                                      BiGG Metabolite  \
0      https://identifiers.org/bigg.metabolite:2mahmp   
1      https://identifiers.org/bigg.metabolite:ile__L   
2       https://identifiers.org/bigg.metabolite:

In [45]:
test_keggs.to_csv("kegg_mappingtest_results.csv", index=False)

Test performed well, 55 seconds to process ten metabolites and retrieve there related identifiers. Will manually expect to ensure accuracy of results, once validated, will run full test to cross-map all metabolites in the NOOR dataset.

## Running Code on Entire dataset

In [58]:
all_results = [kegg_graphdb_single_row(kegg_id) for kegg_id in kegg_ids]

In [59]:
all_results

[{'KEGG_ID': 'C04752',
  'CHEBI': 'https://identifiers.org/CHEBI:16629; https://identifiers.org/CHEBI:57841; http://purl.obolibrary.org/obo/CHEBI_11612; http://purl.obolibrary.org/obo/CHEBI_1194; http://purl.obolibrary.org/obo/CHEBI_11953; http://purl.obolibrary.org/obo/CHEBI_16629; http://purl.obolibrary.org/obo/CHEBI_19684; http://purl.obolibrary.org/obo/CHEBI_20308; http://purl.obolibrary.org/obo/CHEBI_57841',
  'BiGG Metabolite': 'https://identifiers.org/bigg.metabolite:2mahmp',
  'HMDB': 'https://identifiers.org/hmdb:HMDB0304168',
  'KEGG Compound': 'https://identifiers.org/kegg.compound:C04752',
  'MetaCyc': 'https://identifiers.org/metacyc.compound:AMINO-HYDROXYMETHYL-METHYLPYRIMIDINE-PP',
  'SABIORK': 'https://identifiers.org/sabiork.compound:1832; https://sabiork.h-its.org/newSearch?q=1832',
  'SEED': 'https://identifiers.org/seed.compound:cpd02894; https://modelseed.org/biochem/compounds/M_cpd02894; https://modelseed.org/biochem/compounds/cpd02894',
  'MNX': 'https://rdf.meta

In [68]:
def clean_multiple_uris(uris):
    """
    Cleans semicolon-separated URIs in a single cell.
    For example:
    'https://identifiers.org/bigg.metabolite:2mahmp; https://identifiers.org/seed:cpd02894'
    -> 'bigg.metabolite:2mahmp; seed:cpd02894'
    """
    if isinstance(uris, str):  # Ensure the value is a string
        # Split by the delimiter, clean each URI, and join them back
        return "; ".join([uri.split("/")[-1] for uri in uris.split("; ")])
    return uris  # Return as-is if not a string


In [69]:
mapped_identifiers = pd.DataFrame(all_results)

In [70]:
# Apply the cleaning function to all columns except KEGG_ID
for col in mapped_identifiers.columns:
    if col != "KEGG_ID":  # Skip cleaning for the KEGG_ID column
        mapped_identifiers[col] = mapped_identifiers[col].apply(clean_multiple_uris)

print(mapped_identifiers)

    KEGG_ID                                              CHEBI  \
0    C04752  CHEBI:16629; CHEBI:57841; CHEBI_11612; CHEBI_1...   
1    C00407  CHEBI:17191; CHEBI:32604; CHEBI:32605; CHEBI:3...   
2    C01545  CHEBI:17935; CHEBI_11268; CHEBI_17935; CHEBI_2...   
3    C00417  CHEBI:16383; CHEBI:32805; CHEBI_10482; CHEBI_1...   
4    C00197  CHEBI:17050; CHEBI:17794; CHEBI:57998; CHEBI:5...   
..      ...                                                ...   
598  C04105  CHEBI:18080; CHEBI:48068; CHEBI:58372; CHEBI:5...   
599  C02588  CHEBI:17616; CHEBI_12784; CHEBI_17616; CHEBI_2...   
600  C03127                                               None   
601  C00041  CHEBI_57972; CHEBI:16977; CHEBI:229589; CHEBI:...   
602  C00137  CHEBI:17268; CHEBI_10601; CHEBI_12826; CHEBI_1...   

                                       BiGG Metabolite  \
0                               bigg.metabolite:2mahmp   
1                               bigg.metabolite:ile__L   
2                                

In [None]:
# Remove the 'Extra Identifiers' column
mapped_identifiers = mapped_identifiers.drop(columns=["KEGG Compound"])

print(mapped_identifiers)


KeyError: "['KEGG Compound'] not found in axis"

In [75]:
mapped_identifiers.to_csv("openTECR_mapped_metabolites.csv", index=False)

### Determining if any entries were not mapped

In [76]:
# Check for rows where all columns except KEGG_ID are empty
unmapped_kegg = mapped_identifiers[mapped_identifiers.drop(columns=["KEGG_ID"]).isnull().all(axis=1)]

# Count the number of unmapped KEGG IDs
num_unmapped = unmapped_kegg.shape[0]

print(f"Number of unmapped KEGG IDs: {num_unmapped}")
print("Unmapped KEGG IDs:")
print(unmapped_kegg)


Number of unmapped KEGG IDs: 0
Unmapped KEGG IDs:
Empty DataFrame
Columns: [KEGG_ID, CHEBI, BiGG Metabolite, HMDB, MetaCyc, SABIORK, SEED, MNX, VMH, Reactome, SwissLipids, EnviPath, Extra Identifiers]
Index: []
