# Create ChEBI reference files

1. REF_CHEBI2LABEL: dictionary of chebi to label
2. REF_CHEBI2FORMULA: dictionary of chebi to shortened formula
3. REF_CHEBI2NAMES: dictionary of chebi to all synonyms

In [8]:
import urllib.request
import gzip
import compress_pickle
import os
import owlready2
import xmltodict
import string
import pandas as pd
import numpy as np
import re
DATA_DIR = '/Users/luna/Desktop/CRBM/AMAS_proj/Data'

## Data
Data are downloaded from the ChEBI website: https://www.ebi.ac.uk/chebi/downloadsForward.do.   
Ontology file (.owl.gz) are obtained using the link: https://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz.

In [8]:
# download chebi.owl.gz and decompress it
chebi_owl_url = "https://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz"
chebi_owl_file = os.path.join(DATA_DIR, "chebi.owl.gz")
urllib.request.urlretrieve(chebi_owl_url, chebi_owl_file)
with gzip.open(chebi_owl_file, 'rb') as f_in:
    with open(os.path.join(DATA_DIR, "chebi.owl"), 'wb') as f_out:
        f_out.write(f_in.read())

# convert chebi.owl to chebi.xml
onto = owlready2.get_ontology(os.path.join(DATA_DIR, "chebi.owl")).load()
onto.save(os.path.join(DATA_DIR, "chebi.xml"))

In [9]:
contents = open(os.path.join(DATA_DIR, "chebi.xml")).read()
ch = xmltodict.parse(contents)
chebis = ch['rdf:RDF']['owl:Class']
len(chebis)

221729

## Create REF_CHEBI2LABEL

In [11]:
## chebi ID: Chebi Label (name)
## mapping is done only with primary ids
chebi2label = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi:
    primary_id = one_chebi['oboI:id']['#text'].replace('CHEBI:', '')
    chebi_label = one_chebi['rdfs:label']['#text']
    chebi2label[primary_id] = chebi_label
len(chebi2label)

203109

In [12]:
chebi2label

{'10': '(+)-Atherospermoline',
 '100': '(-)-medicarpin',
 '10000': 'Vismione D',
 '100000': '(2S,3S,4R)-3-[4-(3-cyclopentylprop-1-ynyl)phenyl]-4-(hydroxymethyl)-1-(2-methoxy-1-oxoethyl)-2-azetidinecarbonitrile',
 '100001': 'N-[(2R,3S,6R)-2-(hydroxymethyl)-6-[2-[[oxo-[4-(trifluoromethyl)anilino]methyl]amino]ethyl]-3-oxanyl]-3-pyridinecarboxamide',
 '100002': '3-chloro-N-[(5S,6S,9S)-5-methoxy-3,6,9-trimethyl-2-oxo-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]benzenesulfonamide',
 '100003': '(4R,7S,8R)-8-methoxy-4,7,10-trimethyl-11-oxo-14-(1-oxobutylamino)-N-propyl-2-oxa-5,10-diazabicyclo[10.4.0]hexadeca-1(12),13,15-triene-5-carboxamide',
 '100004': '1-(2,5-difluorophenyl)-3-[(5S,6S,9S)-5-methoxy-3,6,9-trimethyl-2-oxo-8-[oxo(2-pyrazinyl)methyl]-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]urea',
 '100005': 'N-[(1S,3S,4aS,9aR)-1-(hydroxymethyl)-3-[2-oxo-2-(1-piperidinyl)ethyl]-3,4,4a,9a-tetrahydro-1H-pyrano[3,4-b]benzofuran-6-yl]-3-methoxybenzenesulfon

In [13]:
compress_pickle.dump(chebi2label, os.path.join(DATA_DIR,'chebi2label_9jun2025.lzma'),
                     compression="lzma", set_default_extension=False)

## Create REF_CHEBI2FORMULA

In [14]:
# just get the first of the formulae if multiple are given. 
chebi_full_formula = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi and 'cheb:formula' in one_chebi:
    primary_id = one_chebi['oboI:id']['#text'].replace('CHEBI:', '')
    if isinstance(one_chebi['cheb:formula'], list):
      chebi_formula = [val['#text'] for val in one_chebi['cheb:formula']]
    else:
      chebi_formula = one_chebi['cheb:formula']['#text']
    chebi_full_formula[primary_id] = chebi_formula
len(chebi_full_formula)

191368

In [15]:
def removeAtom(input_formula, atoms_to_remove):
  """
  Remove a list of Atoms
  from the string formula.
  Primary goal is to remove
  H and D (heavy hydrogen). 
  :param str input_formula:
  :param list-str atoms_to_remove:
  :return str:
  """
  idx_to_remove = []
  letters = re.findall('[A-Z][a-z]?|\d+|.', input_formula)
  for one_letter in atoms_to_remove:
    if one_letter in letters:
      one_idx = letters.index(one_letter)
      idx_to_remove.append(one_idx)
      # if one_idx is the last element, pass
      if len(letters) == (one_idx+1):
        pass
      elif letters[one_idx+1].isdigit():
        idx_to_remove.append(one_idx+1)
  res = [val for idx, val in enumerate(letters) if idx not in idx_to_remove]
  return "".join(res)

# We will be conservative, will choose all formulas mapped from each CHEBI IDs
def getShortenedCHEBIFormula(one_info):
  """
  Extract chemical formula
  and return it or None.
  Formulas are shortened by removing H and D.
  
  Parameters
  ----------
  one_info: collections.OrderedDict
  
  Returns
  -------
  str/None
  """
  if 'cheb:formula' in one_info:
    # if multiple formulae are given, choose the shortest (rare situation, imo)
    if isinstance(one_info['cheb:formula'], list):
      form = sorted([val['#text'] for val in one_info['cheb:formula']], key=len)[0]
    else:
      form = one_info['cheb:formula']['#text']

    res = removeAtom(input_formula=form, atoms_to_remove=['H', 'D'])
    # if res is only composed of 'H' or 'D'. assign 'H'
    if res == '':
      res = 'H'
  else:
    res = None
  return res

chebi_shortened_formula = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi:
    primary_id = one_chebi['oboI:id']['#text'].replace('CHEBI:', '')
    one_shortened_formula = getShortenedCHEBIFormula(one_chebi)
    if one_shortened_formula is not None:
      chebi_shortened_formula[primary_id] = one_shortened_formula

print(len(chebi_shortened_formula))

  letters = re.findall('[A-Z][a-z]?|\d+|.', input_formula)


191368


In [16]:
chebi_shortened_formula

{'10': 'C36N2O6',
 '100': 'C16O4',
 '10000': 'C25O5',
 '100000': 'C22N2O3',
 '100001': 'C22F3N4O4',
 '100002': 'C23ClN3O5S',
 '100003': 'C25N4O5',
 '100004': 'C29F2N6O5',
 '100005': 'C26N2O7S',
 '100006': 'C23F2N3O6',
 '100007': 'C33N4O5S',
 '100008': 'C19N2O4',
 '100009': 'C17FN2O3S',
 '10001': 'C21O7',
 '100010': 'C28N3O4',
 '100011': 'C23FN2O5',
 '100012': 'C28N5O5',
 '100013': 'C27N4O5',
 '100014': 'C22N4O6S',
 '100015': 'C18ClN2O5',
 '100016': 'C19N3O4',
 '100017': 'C27N4O5',
 '100018': 'C33FN4O4S',
 '100019': 'C20F3N5O4',
 '10002': 'C13O4',
 '100020': 'C23N3O4',
 '100021': 'C31FN3O6S',
 '100022': 'C28N3O7',
 '100023': 'C28N3O4',
 '100024': 'C23N3O2',
 '100025': 'C31N4O6S',
 '100026': 'C28N3O6',
 '100027': 'C26N3O6',
 '100028': 'C30N2O7',
 '100029': 'C29N3O5',
 '10003': 'C17N4O10.(H2O4S)n',
 '100030': 'C24N3O6S',
 '100031': 'C19N2O5',
 '100032': 'C25FN2O3',
 '100033': 'C27N4O5',
 '100034': 'C15FNO6S',
 '100035': 'C31N2O5',
 '100036': 'C25N5O6',
 '100037': 'C24FN4O4',
 '100038': 'C

In [17]:
with open(os.path.join(DATA_DIR, 'chebi_shortened_formula_9jun2025.lzma'), 'wb') as handle:
    compress_pickle.dump(chebi_shortened_formula, handle, compression="lzma", set_default_extension=False)

## Create REF_CHEBI2NAMES

This is used for mapping chebi ids to standard name and synonyms, and used in the RAG approach.

In [18]:
# save all synonyms for each chebi id to a dictionary
chebi2names = dict()
for one_chebi in chebis:
    if 'oboI:id' in one_chebi:
        primary_id = one_chebi['oboI:id']['#text'].replace('CHEBI:', '')
        synonyms = set()  # Using a set to remove duplicates
        # standard name
        if 'rdfs:label' in one_chebi:
            standard_name = one_chebi['rdfs:label']['#text'].lower()
            synonyms.add(standard_name)
        # synonyms
        # 'oboI:hasExactSynonym'
        if 'oboI:hasExactSynonym' in one_chebi:
            exact_synonyms = one_chebi['oboI:hasExactSynonym']
            if isinstance(exact_synonyms, list):  # If it's a list, extract all texts
                for syn in exact_synonyms:
                    synonyms.add(syn['#text'].lower())
            else:  # If it's a single dictionary, extract its text
                synonyms.add(exact_synonyms['#text'].lower())
        # 'oboI:hasRelatedSynonym'
        if 'oboI:hasRelatedSynonym' in one_chebi:
            related_synonyms = one_chebi['oboI:hasRelatedSynonym']
            if isinstance(related_synonyms, list):  # If it's a list, extract all texts
                for syn in related_synonyms:
                    synonyms.add(syn['#text'].lower())
            else:  # If it's a single dictionary, extract its text
                synonyms.add(related_synonyms['#text'].lower())
        if not synonyms:
            continue
        chebi2names[primary_id] = list(synonyms)  # Convert set back to list

len(chebi2names)

203109

In [19]:
chebi2names

{'10': ['(+)-atherospermoline'],
 '100': ['medicarpin',
  '(-)-medicarpin',
  '(6ar,11ar)-9-methoxy-6a,11a-dihydro-6h-[1]benzofuro[3,2-c]chromen-3-ol'],
 '10000': ['vismione d'],
 '100000': ['(2s,3s,4r)-3-[4-(3-cyclopentylprop-1-ynyl)phenyl]-4-(hydroxymethyl)-1-(2-methoxy-1-oxoethyl)-2-azetidinecarbonitrile'],
 '100001': ['n-[(2r,3s,6r)-2-(hydroxymethyl)-6-[2-[[oxo-[4-(trifluoromethyl)anilino]methyl]amino]ethyl]-3-oxanyl]-3-pyridinecarboxamide'],
 '100002': ['3-chloro-n-[(5s,6s,9s)-5-methoxy-3,6,9-trimethyl-2-oxo-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]benzenesulfonamide'],
 '100003': ['(4r,7s,8r)-8-methoxy-4,7,10-trimethyl-11-oxo-14-(1-oxobutylamino)-n-propyl-2-oxa-5,10-diazabicyclo[10.4.0]hexadeca-1(12),13,15-triene-5-carboxamide'],
 '100004': ['1-(2,5-difluorophenyl)-3-[(5s,6s,9s)-5-methoxy-3,6,9-trimethyl-2-oxo-8-[oxo(2-pyrazinyl)methyl]-11-oxa-3,8-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]urea'],
 '100005': ['n-[(1s,3s,4as,9ar)-1-(hydroxymethyl)-

In [20]:
with open(os.path.join(DATA_DIR, 'chebi2names_9jun2025.lzma'), 'wb') as handle:
    compress_pickle.dump(chebi2names, handle, compression="lzma", set_default_extension=False)

In [21]:
# save all synonyms for each chebi id to a dictionary
# this one all symbols (characters except numbers and letters) will be removed
def remove_symbols(text):
    # Remove all characters except numbers and letters
    return re.sub(r'[^a-zA-Z0-9]', '', text)

chebi2cleannames = dict()
for one_chebi in chebis:
    if 'oboI:id' in one_chebi:
        primary_id = one_chebi['oboI:id']['#text'].replace('CHEBI:', '')
        synonyms = set()  # Using a set to remove duplicates
        # standard name
        if 'rdfs:label' in one_chebi:
            standard_name = one_chebi['rdfs:label']['#text'].lower()
            clean_standard_name = remove_symbols(standard_name)
            if clean_standard_name:
                synonyms.add(clean_standard_name)
        # synonyms
        # 'oboI:hasExactSynonym'
        if 'oboI:hasExactSynonym' in one_chebi:
            exact_synonyms = one_chebi['oboI:hasExactSynonym']
            if isinstance(exact_synonyms, list):  # If it's a list, extract all texts
                for syn in exact_synonyms:
                    clean_syn = remove_symbols(syn['#text'].lower())
                    if clean_syn:
                        synonyms.add(clean_syn)
            else:  # If it's a single dictionary, extract its text
                clean_syn = remove_symbols(exact_synonyms['#text'].lower())
                if clean_syn:
                    synonyms.add(clean_syn)
        # 'oboI:hasRelatedSynonym'
        if 'oboI:hasRelatedSynonym' in one_chebi:
            related_synonyms = one_chebi['oboI:hasRelatedSynonym']
            if isinstance(related_synonyms, list):  # If it's a list, extract all texts
                for syn in related_synonyms:
                    clean_syn = remove_symbols(syn['#text'].lower())
                    if clean_syn:
                        synonyms.add(clean_syn)
            else:  # If it's a single dictionary, extract its text
                clean_syn = remove_symbols(related_synonyms['#text'].lower())
                if clean_syn:
                    synonyms.add(clean_syn)
        if not synonyms:
            continue
        chebi2cleannames[primary_id] = list(synonyms)  # Convert set back to list
len(chebi2cleannames)

203109

In [22]:
chebi2cleannames

{'10': ['atherospermoline'],
 '100': ['medicarpin', '6ar11ar9methoxy6a11adihydro6h1benzofuro32cchromen3ol'],
 '10000': ['vismioned'],
 '100000': ['2s3s4r343cyclopentylprop1ynylphenyl4hydroxymethyl12methoxy1oxoethyl2azetidinecarbonitrile'],
 '100001': ['n2r3s6r2hydroxymethyl62oxo4trifluoromethylanilinomethylaminoethyl3oxanyl3pyridinecarboxamide'],
 '100002': ['3chloron5s6s9s5methoxy369trimethyl2oxo11oxa38diazabicyclo1040hexadeca1121315trien14ylbenzenesulfonamide'],
 '100003': ['4r7s8r8methoxy4710trimethyl11oxo141oxobutylaminonpropyl2oxa510diazabicyclo1040hexadeca1121315triene5carboxamide'],
 '100004': ['125difluorophenyl35s6s9s5methoxy369trimethyl2oxo8oxo2pyrazinylmethyl11oxa38diazabicyclo1040hexadeca1121315trien14ylurea'],
 '100005': ['n1s3s4as9ar1hydroxymethyl32oxo21piperidinylethyl344a9atetrahydro1hpyrano34bbenzofuran6yl3methoxybenzenesulfonamide'],
 '100006': ['n13benzodioxol5ylmethyl22r3r6s325difluoroanilinooxomethylamino2hydroxymethyl36dihydro2hpyran6ylacetamide'],
 '100007': ['ls

In [23]:
with open(os.path.join(DATA_DIR, 'chebi2cleannames_9jun2025.lzma'), 'wb') as handle:
    compress_pickle.dump(chebi2cleannames, handle, compression="lzma", set_default_extension=False)

### Create NAMES2CHEBI


In [24]:
# Create an exact match index for ChEBI synonyms
def create_chebi_exact_match_index(chebi2names):
    """
    Create an efficient exact match index for ChEBI synonym lookup
    
    Parameters
    ----------
    chebi2names: dict
        Dictionary mapping ChEBI IDs to lists of names
    
    Returns
    -------
    dict
        Maps normalized synonyms to lists of ChEBI IDs
    """
    exact_match_index = {}
    
    for chebi_id, synonyms in chebi2names.items():
        for synonym in synonyms:
            # Create exact match index
            norm_synonym = synonym.lower()
            if norm_synonym not in exact_match_index:
                exact_match_index[norm_synonym] = []
            if chebi_id not in exact_match_index[norm_synonym]:
                exact_match_index[norm_synonym].append(chebi_id)
    
    return exact_match_index

# Create the index
exact_match_index = create_chebi_exact_match_index(chebi2names)

print(f"Exact match index has {len(exact_match_index)} entries")

with open(os.path.join(DATA_DIR, 'names2chebi_9jun2025.lzma'), 'wb') as handle:
    compress_pickle.dump(exact_match_index, handle, 
                       compression="lzma", set_default_extension=False)

Exact match index has 541835 entries


In [25]:
# without symbols
exact_match_index = create_chebi_exact_match_index(chebi2cleannames)

print(f"Exact match index has {len(exact_match_index)} entries")

with open(os.path.join(DATA_DIR, 'cleannames2chebi_9jun2025.lzma'), 'wb') as handle:
    compress_pickle.dump(exact_match_index, handle, 
                        compression="lzma", set_default_extension=False)

Exact match index has 522387 entries


### Create chebi2prime

Map secondary chebi ids to prime chebi ids, this was used to map secondary chebi ids to prime chebi ids in the RHEA reference file.    
    
However, after testing, I found that most of the rhea reference file used the prime chebi ids, and there are only two cases where the secondary chebi ids were used. Therefore, this mapping is not used.

In [13]:
second2prime_dict = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi:
    primary_id = one_chebi['oboI:id']['#text']
    secondary_id = []
    if 'oboI:hasAlternativeId' in one_chebi:
        alt_ids = one_chebi['oboI:hasAlternativeId']
        
        # If it's a dictionary (single alternative ID)
        if isinstance(alt_ids, dict):
            secondary_id.append(alt_ids['#text'])
        
        # If it's a list (multiple alternative IDs)
        elif isinstance(alt_ids, list):
            secondary_id = [val['#text'] for val in alt_ids]
    else:
      secondary_id = None
    second2prime_dict[primary_id] = primary_id
    if secondary_id:
      for one_secondary in secondary_id:
        second2prime_dict[one_secondary] = primary_id

In [15]:
with open(os.path.join(DATA_DIR, 'chebi_second2prime_jan2025.lzma'), 'wb') as handle:
    compress_pickle.dump(second2prime_dict, handle, compression="lzma", set_default_extension=False)