# Developing Methods to Generate/Retrieve mol Files to provide a Full Chemical Specification of all Molecules in the openTECR dataset

In [5]:
import os
import pandas as pd
import re
import subprocess
from rdkit import Chem
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Paths to the CSV files this notebook reads - to extract info for further processing.
# Identifier-mapping table for the openTECR metabolites (loaded into `mapped_mets` below).
mapped_ids = '/home/jackmcgoldrick/openTECR/results/openTECR_mapped_metabolites.csv'

# Path to the TECRDB CSV. NOTE(review): not referenced again in the cells visible here.
initial_data = '/home/jackmcgoldrick/openTECR/data/TECRDB.csv'

In [3]:
# Load the identifier-mapping CSV into a DataFrame.
# The file was saved with its row index, which pandas reads back as an "Unnamed: 0" column.
mapped_mets = pd.read_csv(mapped_ids)

In [4]:
# Preview the first rows to confirm the identifier columns loaded as expected
mapped_mets.head()

Unnamed: 0,KEGG_ID,CHEBI,BiGG Metabolite,HMDB,MetaCyc,SABIORK,SEED,MNX,VMH,Reactome,SwissLipids,EnviPath,Extra Identifiers
0,C04752,CHEBI:16629; CHEBI:57841; CHEBI_11612; CHEBI_1...,bigg.metabolite:2mahmp,hmdb:HMDB0304168,metacyc.compound:AMINO-HYDROXYMETHYL-METHYLPYR...,sabiork.compound:1832; newSearch?q=1832,seed.compound:cpd02894; M_cpd02894; cpd02894,MNXM1101289,,,,,2mahmp; M_2mahmp; M_C04752; compound?id=AMINO-...
1,C00407,CHEBI:17191; CHEBI:32604; CHEBI:32605; CHEBI:3...,bigg.metabolite:ile__L,hmdb:HMDB0000172; hmdb:HMDB00172; hmdb:HMDB003...,metacyc.compound:CPD-12149; metacyc.compound:C...,sabiork.compound:23116; sabiork.compound:70; n...,seed.compound:cpd00322; seed.compound:cpd15139...,MNXM1366448,vmhmetabolite:ile_L,reactome:R-ALL-113537; reactome:R-ALL-30102; r...,,,M_ile__L; ile-L; ile__L; kegg.drug:D00065; C16...
2,C01545,CHEBI:17935; CHEBI_11268; CHEBI_17935; CHEBI_2...,bigg.metabolite:octal,hmdb:HMDB0001140; hmdb:HMDB01140,metacyc.compound:CPD-371,sabiork.compound:5652; newSearch?q=5652,seed.compound:cpd01088; seed.compound:cpd15619...,MNXM2705,,,SLM:000389951; SLM:000389951,9dcb30dc-d751-4434-8d9c-68b4f5453145; 8421e286...,M_octal; octal; 9dcb30dc-d751-4434-8d9c-68b4f5...
3,C00417,CHEBI:16383; CHEBI:32805; CHEBI_10482; CHEBI_1...,bigg.metabolite:HC00342; bigg.metabolite:acon;...,hmdb:HMDB0000072; hmdb:HMDB0000461; hmdb:HMDB0...,metacyc.compound:CIS-ACONITATE,sabiork.compound:2043; newSearch?q=2043,seed.compound:cpd00331; M_cpd00331; cpd00331,MNXM1092518,vmhmetabolite:HC00342; vmhmetabolite:acon_C,,,643481e5-a35b-477e-8665-70f4dca66baa,HC00342; M_HC00342; M_acon; M_acon_C; acon; ac...
4,C00197,CHEBI:17050; CHEBI:17794; CHEBI:57998; CHEBI:5...,bigg.metabolite:3pg,hmdb:HMDB0000807; hmdb:HMDB0060180; hmdb:HMDB0...,metacyc.compound:G3P,sabiork.compound:21216; sabiork.compound:30; n...,seed.compound:cpd00169; seed.compound:cpd30741...,MNXM727604,vmhmetabolite:3pg,reactome:R-ALL-29728; reactome:R-ALL-6799493,SLM:000489958; SLM:000489958,47064115-384f-43ee-b9e2-5e9d7aed5217,3pg; M_3pg; 47064115-384f-43ee-b9e2-5e9d7aed52...


In [5]:
def mnx_inchi(mnx_id, sparql_endpoint="http://sbg:7200/repositories/ReconXKG"):
    """
    Query the ReconX Knowledge Graph for the InChI string(s)
    associated with a given MNX identifier.

    Args:
        mnx_id (str): MNX identifier whose label is matched exactly, e.g. "MNXM2705".
        sparql_endpoint (str): URL of the SPARQL endpoint to query.

    Returns:
        list[str]: The InChI strings bound by the query. An empty list is returned
        when the identifier is missing or malformed, when nothing matches, or when
        the query fails (the error is printed in that case).
    """

    # Missing identifiers (None, or NaN from an empty DataFrame cell) are not strings.
    # Skip the network round trip instead of searching for the literal text "nan".
    if not isinstance(mnx_id, str):
        return []

    # The identifier is pasted into a regex inside the query text, so only plain
    # identifier characters are accepted: quotes or regex metacharacters would change
    # the meaning of the query.
    if not re.fullmatch(r"[A-Za-z0-9_-]+", mnx_id):
        print(f"Skipping malformed MNX identifier: {mnx_id!r}")
        return []

    # Define the SPARQL endpoint
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setReturnFormat(JSON)

    # SPARQL query with dynamic MNX ID (anchored so that only the exact label matches)
    query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX mnx: <https://rdf.metanetx.org/schema/>
    PREFIX chem: <https://rdf.metanetx.org/chem/>

    SELECT ?mnx ?inchi
    WHERE {{
        ?mnx a mnx:CHEM ;
             mnx:inchi ?inchi ;
             rdfs:label ?mnx_identifier .

        FILTER(REGEX(STR(?mnx_identifier), "^{mnx_id}$"))
    }}
    """

    # Set the query
    sparql.setQuery(query)

    # Execute and process the results
    try:
        results = sparql.query().convert()
        bindings = results["results"]["bindings"]

        # Extract and return InChI strings
        return [entry["inchi"]["value"] for entry in bindings]

    except Exception as e:
        # Best effort: report the problem and let the caller carry on with "no result"
        print(f"An error occurred: {e}")
        return []

In [6]:
# Spot-check the lookup on a single identifier before applying it to the whole table
mnx_id = "MNXM2705"
inchi_strings = mnx_inchi(mnx_id)
print(f"InChI strings for {mnx_id}: {inchi_strings}")


InChI strings for MNXM2705: ['InChI=1S/C8H16O/c1-2-3-4-5-6-7-8-9/h8H,2-7H2,1H3']


In [9]:
# Look up the InChI(s) for every MNX identifier and store them in a new "InChI" column;
# several hits for one identifier are joined with "; ", no hit gives an empty string
mapped_mets["InChI"] = ["; ".join(mnx_inchi(mnx_id)) for mnx_id in mapped_mets["MNX"]]

print(mapped_mets)


    KEGG_ID                                              CHEBI  \
0    C04752  CHEBI:16629; CHEBI:57841; CHEBI_11612; CHEBI_1...   
1    C00407  CHEBI:17191; CHEBI:32604; CHEBI:32605; CHEBI:3...   
2    C01545  CHEBI:17935; CHEBI_11268; CHEBI_17935; CHEBI_2...   
3    C00417  CHEBI:16383; CHEBI:32805; CHEBI_10482; CHEBI_1...   
4    C00197  CHEBI:17050; CHEBI:17794; CHEBI:57998; CHEBI:5...   
..      ...                                                ...   
598  C04105  CHEBI:18080; CHEBI:48068; CHEBI:58372; CHEBI:5...   
599  C02588  CHEBI:17616; CHEBI_12784; CHEBI_17616; CHEBI_2...   
600  C03127                                                NaN   
601  C00041  CHEBI_57972; CHEBI:16977; CHEBI:229589; CHEBI:...   
602  C00137  CHEBI:17268; CHEBI_10601; CHEBI_12826; CHEBI_1...   

                                       BiGG Metabolite  \
0                               bigg.metabolite:2mahmp   
1                               bigg.metabolite:ile__L   
2                                

In [10]:
# Save the InChI-annotated mapping table to a csv file.
# index=False: the row index carries no information here, and writing it adds one more
# "Unnamed: 0" column every time the file is saved and read back.
mapped_mets.to_csv("/home/jackmcgoldrick/openTECR/results/mapping_openTECR_inchis.csv", index=False)

## Retrieving KEGG related mol Files 

This section will attempt to retrieve the set of mol files from KEGG for the identifiers above. Later, a structure check will be implemented (likely with the InChI software, `inchi-1`) to ensure the molecular structures have fully defined stereochemistry and that the other structural identifiers are consistent. Some molecules, such as macrocycles, will of course cause more difficulties than linear/branched molecules. 

**HOLD OFF ON THIS FOR NOW, CANNOT ACCESS 2013 API CURRENTLY VIA WEBARCHIVES.ORG**

## Generating molFiles directly from retrieved InChI strings

In [27]:
# Path to the saved csv file containing the InChIs - error with the code above, will regenerate later.
# NOTE(review): this path is under results/structural_mapping/, whereas the save cell above
# writes to results/ - confirm the file was moved there.
csv_path = "/home/jackmcgoldrick/openTECR/results/structural_mapping/mapping_openTECR_inchis.csv"

In [28]:
# Load the InChI-annotated mapping table from the saved csv into a DataFrame
mapped_inchis = pd.read_csv(csv_path)

In [29]:
# Preview the first rows to confirm the "InChI" column is present
mapped_inchis.head()

Unnamed: 0.1,Unnamed: 0,KEGG_ID,CHEBI,BiGG Metabolite,HMDB,MetaCyc,SABIORK,SEED,MNX,VMH,Reactome,SwissLipids,EnviPath,Extra Identifiers,InChI
0,0,C04752,CHEBI:16629; CHEBI:57841; CHEBI_11612; CHEBI_1...,bigg.metabolite:2mahmp,hmdb:HMDB0304168,metacyc.compound:AMINO-HYDROXYMETHYL-METHYLPYR...,sabiork.compound:1832; newSearch?q=1832,seed.compound:cpd02894; M_cpd02894; cpd02894,MNXM1101289,,,,,2mahmp; M_2mahmp; M_C04752; compound?id=AMINO-...,InChI=1S/C6H11N3O7P2/c1-4-8-2-5(6(7)9-4)3-15-1...
1,1,C00407,CHEBI:17191; CHEBI:32604; CHEBI:32605; CHEBI:3...,bigg.metabolite:ile__L,hmdb:HMDB0000172; hmdb:HMDB00172; hmdb:HMDB003...,metacyc.compound:CPD-12149; metacyc.compound:C...,sabiork.compound:23116; sabiork.compound:70; n...,seed.compound:cpd00322; seed.compound:cpd15139...,MNXM1366448,vmhmetabolite:ile_L,reactome:R-ALL-113537; reactome:R-ALL-30102; r...,,,M_ile__L; ile-L; ile__L; kegg.drug:D00065; C16...,"InChI=1S/C6H13NO2/c1-3-4(2)5(7)6(8)9/h4-5H,3,7..."
2,2,C01545,CHEBI:17935; CHEBI_11268; CHEBI_17935; CHEBI_2...,bigg.metabolite:octal,hmdb:HMDB0001140; hmdb:HMDB01140,metacyc.compound:CPD-371,sabiork.compound:5652; newSearch?q=5652,seed.compound:cpd01088; seed.compound:cpd15619...,MNXM2705,,,SLM:000389951; SLM:000389951,9dcb30dc-d751-4434-8d9c-68b4f5453145; 8421e286...,M_octal; octal; 9dcb30dc-d751-4434-8d9c-68b4f5...,"InChI=1S/C8H16O/c1-2-3-4-5-6-7-8-9/h8H,2-7H2,1H3"
3,3,C00417,CHEBI:16383; CHEBI:32805; CHEBI_10482; CHEBI_1...,bigg.metabolite:HC00342; bigg.metabolite:acon;...,hmdb:HMDB0000072; hmdb:HMDB0000461; hmdb:HMDB0...,metacyc.compound:CIS-ACONITATE,sabiork.compound:2043; newSearch?q=2043,seed.compound:cpd00331; M_cpd00331; cpd00331,MNXM1092518,vmhmetabolite:HC00342; vmhmetabolite:acon_C,,,643481e5-a35b-477e-8665-70f4dca66baa,HC00342; M_HC00342; M_acon; M_acon_C; acon; ac...,InChI=1S/C6H6O6/c7-4(8)1-3(6(11)12)2-5(9)10/h1...
4,4,C00197,CHEBI:17050; CHEBI:17794; CHEBI:57998; CHEBI:5...,bigg.metabolite:3pg,hmdb:HMDB0000807; hmdb:HMDB0060180; hmdb:HMDB0...,metacyc.compound:G3P,sabiork.compound:21216; sabiork.compound:30; n...,seed.compound:cpd00169; seed.compound:cpd30741...,MNXM727604,vmhmetabolite:3pg,reactome:R-ALL-29728; reactome:R-ALL-6799493,SLM:000489958; SLM:000489958,47064115-384f-43ee-b9e2-5e9d7aed5217,3pg; M_3pg; 47064115-384f-43ee-b9e2-5e9d7aed52...,"InChI=1S/C3H7O7P/c4-2(3(5)6)1-10-11(7,8)9/h2,4..."


In [10]:
# Select the metabolites for which no InChI was returned (missing value in the "InChI" column)
inchi_absent = mapped_inchis.loc[mapped_inchis["InChI"].isna()]

In [11]:
# Display the rows that have no InChI
inchi_absent

Unnamed: 0.1,Unnamed: 0,KEGG_ID,CHEBI,BiGG Metabolite,HMDB,MetaCyc,SABIORK,SEED,MNX,VMH,Reactome,SwissLipids,EnviPath,Extra Identifiers,InChI
13,13,C01653,CHEBI:29183; CHEBI_10694; CHEBI_15191; CHEBI_2...,bigg.metabolite:trnaval,,metacyc.compound:VAL-tRNAs,,seed.compound:cpd11924; seed.compound:cpd28318...,MNXM90885,,,,,M_trnaval; trnaval; M_C01653; compound?id=VAL-...,
20,20,C01277,CHEBI_58178; CHEBI:58178; CHEBI_61083,bigg.metabolite:pail4p_hs,,,,,MNXM1103886,vmhmetabolite:pail4p_hs,,SLM:000000346; SLM:000000346,,M_pail4p_hs; pail4p_hs; M_C01277; pail4p_hs,
33,33,C00787,CHEBI:29182; CHEBI_10692; CHEBI_15189; CHEBI_2...,bigg.metabolite:trnatyr,,metacyc.compound:TYR-tRNAs,sabiork.compound:3838; newSearch?q=3838,seed.compound:cpd11751; M_cpd11751; cpd11751,MNXM90668,,,,,M_trnatyr; trnatyr; M_C00787; compound?id=TYR-...,
45,45,C01636,CHEBI:29171; CHEBI_10673; CHEBI_15167; CHEBI_2...,bigg.metabolite:trnaarg,,metacyc.compound:ARG-tRNAs,sabiork.compound:3837; newSearch?q=3837,seed.compound:cpd11907; seed.compound:cpd22281...,MNXM90751,,,,,M_trnaarg; trnaarg; M_C01636; compound?id=ARG-...,
46,46,C04618,,,,,,seed.compound:cpd11478; M_cpd11478; cpd11478,MNXM728726,vmhmetabolite:3hbacp; vmhmetabolite:HC01321,,,,M_C04618; 3hbacp; HC01321,
51,51,C02988,,,,,,,MNXM741218,,,,,M_C02988,
54,54,C00126,,,,,,,MNXM731957,,,,,M_C00126,
63,63,C03875,,,,metacyc.compound:CPD-8563,,seed.compound:cpd12412; M_cpd12412; cpd12412,MNXM92465,,,,,M_C03875; compound?id=CPD-8563,
111,111,C06020,CHEBI:27833; CHEBI_25292; CHEBI_27833; CHEBI_6870,,,,,seed.compound:cpd12809; M_cpd12809; cpd12809,MNXM5822,vmhmetabolite:mecfsp,,,,M_C06020; mecfsp,
147,147,C04246,CHEBI:132146; CHEBI_132146,,,,,seed.compound:cpd11465; M_cpd11465; cpd11465,MNXM728706,vmhmetabolite:2beacp; vmhmetabolite:HC01255,,,,M_C04246; 2beacp; HC01255,


In [16]:
# Number of metabolites without an InChI
print(len(inchi_absent))

39


For cases where there is no InChI for a molecule, we could either try to retrieve a SMILES string and generate the structure from it, or download the mol file directly from KEGG, if it exists. A manual check for its presence should be done first.

- Use this newly generated dataframe **"inchi_absent"**, to return SMILES/molFiles for those molecules only where it is required 
- **NOTE** molFiles will be retrieved from KEGG regardless, but in case they do not exist or are not well defined, SMILES could also be useful to have

### Returning SMILES for the Subset of molecules with no InChI string

In [17]:
def mnx_smiles(mnx_id, sparql_endpoint="http://sbg:7200/repositories/ReconXKG"):
    """
    Query the ReconX Knowledge Graph for the SMILES string(s)
    associated with a given MNX identifier.

    Args:
        mnx_id (str): MNX identifier whose label is matched exactly, e.g. "MNXM2705".
        sparql_endpoint (str): URL of the SPARQL endpoint to query.

    Returns:
        list[str]: The SMILES strings bound by the query. An empty list is returned
        when the identifier is missing or malformed, when nothing matches, or when
        the query fails (the error is printed in that case).
    """

    # Missing identifiers (None, or NaN from an empty DataFrame cell) are not strings.
    # Skip the network round trip instead of searching for the literal text "nan".
    if not isinstance(mnx_id, str):
        return []

    # The identifier is pasted into a regex inside the query text, so only plain
    # identifier characters are accepted: quotes or regex metacharacters would change
    # the meaning of the query.
    if not re.fullmatch(r"[A-Za-z0-9_-]+", mnx_id):
        print(f"Skipping malformed MNX identifier: {mnx_id!r}")
        return []

    # Define the SPARQL endpoint
    sparql = SPARQLWrapper(sparql_endpoint)
    sparql.setReturnFormat(JSON)

    # SPARQL query with dynamic MNX ID (anchored so that only the exact label matches)
    query = f"""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX mnx: <https://rdf.metanetx.org/schema/>

    select ?mnx ?smiles
    where{{
    ?mnx a mnx:CHEM ;
         mnx:smiles ?smiles ;
         rdfs:label ?mnx_label .
    FILTER(REGEX(?mnx_label, "^{mnx_id}$"))
    }}
    """

    # Set the query
    sparql.setQuery(query)

    # Execute and process the results
    try:
        results = sparql.query().convert()
        bindings = results["results"]["bindings"]

        # Extract and return SMILES strings
        return [entry["smiles"]["value"] for entry in bindings]

    except Exception as e:
        # Best effort: report the problem and let the caller carry on with "no result"
        print(f"An error occurred: {e}")
        return []

In [18]:
# Work on an independent copy so the new column is not written into a slice of mapped_inchis
inchi_absent = inchi_absent.copy()
# Look up the SMILES for every MNX identifier; several hits are joined with "; "
inchi_absent["SMILES"] = ["; ".join(mnx_smiles(mnx_id)) for mnx_id in inchi_absent["MNX"]]

print(inchi_absent)

     Unnamed: 0 KEGG_ID                                              CHEBI  \
13           13  C01653  CHEBI:29183; CHEBI_10694; CHEBI_15191; CHEBI_2...   
20           20  C01277              CHEBI_58178; CHEBI:58178; CHEBI_61083   
33           33  C00787  CHEBI:29182; CHEBI_10692; CHEBI_15189; CHEBI_2...   
45           45  C01636  CHEBI:29171; CHEBI_10673; CHEBI_15167; CHEBI_2...   
46           46  C04618                                                NaN   
51           51  C02988                                                NaN   
54           54  C00126                                                NaN   
63           63  C03875                                                NaN   
111         111  C06020  CHEBI:27833; CHEBI_25292; CHEBI_27833; CHEBI_6870   
147         147  C04246                         CHEBI:132146; CHEBI_132146   
157         157  C01643  CHEBI:29178; CHEBI_10682; CHEBI_15178; CHEBI_2...   
166         166  C02839                                         

In [19]:
# Save the SMILES-annotated subset of metabolites that have no InChI.
# index=False: avoids writing the row index as yet another "Unnamed" column on re-read.
inchi_absent.to_csv("/home/jackmcgoldrick/openTECR/results/structural_mapping/inchis_absent.csv", index=False)

## Generating Structures from InChI strings, and also SMILES (where needed)

In [22]:
# Proof of concept on a single small molecule: InChI -> RDKit Mol -> MOL file
inchi_string = "InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3"

# MolFromInchi returns None when the string cannot be converted
mol = Chem.MolFromInchi(inchi_string)
if mol is not None:
    # Write the mol block into the current working directory
    with open("etoh.mol", "w") as mol_file:
        mol_file.write(Chem.MolToMolBlock(mol))
    print("MOL file saved as etoh.mol")
else:
    print("Failed to convert InChI to Mol object.")


MOL file saved as etoh.mol


Now that we know the code above works to generate a mol file for a simple molecule such as **VMH:etoh**, the code should be adapted to run on the entire dataset of 603 metabolites (minus those which have no InChI string at present).

First run will be to examine those molecules which have stereochemistry, and to determine whether or not stereochemistry is converted readily across from the inchi string to the generated mol file.

In [30]:
def gen_molFile(inchi, output_dir, dataframe, log_file="failures.log"):
    """
    Converts an InChI string into a mol file using RDKit and saves it, with debugging.

    One ``<KEGG_ID>.mol`` file is written for every row of ``dataframe`` whose
    "InChI" value equals ``inchi``. Any failure is appended to ``log_file`` and
    printed; no exception is propagated to the caller.

    Args:
        inchi (str): The InChI string of the molecule.
        output_dir (str): The full path to the directory to save the mol files.
        dataframe (pd.DataFrame): DataFrame containing the "InChI" and "KEGG_ID" columns.
        log_file (str): Path to the log file for recording failures.
    """
    # KEGG ID(s) reported in the failure log; stays "N/A" until the lookup has succeeded,
    # so the error handler never depends on a variable that may not exist yet
    kegg_label = "N/A"
    try:
        # Retrieve every KEGG ID associated with the InChI
        # (a missing/NaN InChI never compares equal, so it matches no row)
        rows = dataframe[dataframe["InChI"] == inchi]
        if rows.empty:
            raise ValueError(f"No KEGG ID found for InChI: {inchi}")
        kegg_ids = [str(kegg) for kegg in rows["KEGG_ID"]]
        kegg_label = ", ".join(kegg_ids)

        # Validate the InChI string
        if not inchi or not isinstance(inchi, str):
            raise ValueError(f"Invalid InChI string: {inchi}")

        # Convert the InChI string into an RDKit Mol object
        mol = Chem.MolFromInchi(inchi, sanitize=True, removeHs=True)
        if mol is None:
            raise ValueError(f"RDKit failed to convert InChI: {inchi}")

        # Build the mol block before any file is opened, so a failure here
        # does not leave an empty .mol file behind
        mol_block = Chem.MolToMolBlock(mol)

        # Ensure the output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Write one file per KEGG ID sharing this InChI
        for kegg in kegg_ids:
            file_path = os.path.join(output_dir, f"{kegg}.mol")
            with open(file_path, "w") as mol_file:
                mol_file.write(mol_block)
            print(f"MOL file saved as {file_path}")

    except Exception as e:
        # Log the failure details
        with open(log_file, "a") as log:
            log.write(f"Failure for InChI: {inchi}, KEGG_ID: {kegg_label}, Error: {e}\n")
        print(f"Error: {e}")


In [31]:
# Directory that receives one <KEGG_ID>.mol file per converted molecule
output_directory = "/home/jackmcgoldrick/openTECR/data/molFiles_Final"

# Convert every entry of the "InChI" column; gen_molFile appends failures to its
# default log file ("failures.log") instead of raising
for inchi in mapped_inchis["InChI"]:
    gen_molFile(inchi, output_directory, mapped_inchis)


MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C04752.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00407.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01545.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00417.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00197.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00492.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00357.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C14177.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C04442.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C04741.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00469.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00013.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data



MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01131.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01487.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00222.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00683.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00842.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00077.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00058.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00332.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C08126.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C07064.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C05922.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00313.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data

### Counting the Number of molFiles generated 

Expected number of molFiles to be generated == 603-39 == **564 molFiles**

The 39 subtracted are the molecules without InChIs, whose molFiles will be generated from their SMILES in subsequent sections.

In [32]:
def count_files_in_directory(directory_path):
    """
    Count the files located directly inside a directory.

    Sub-directories are not counted and are not descended into.

    Args:
        directory_path (str): Path of the directory to inspect.

    Returns:
        int | str: The number of files, or an "Error: ..." message string when the
        directory cannot be listed (no exception is raised).
    """
    try:
        # Keep only the directory entries that are files
        file_names = [
            name
            for name in os.listdir(directory_path)
            if os.path.isfile(os.path.join(directory_path, name))
        ]
        return len(file_names)
    except FileNotFoundError:
        return "Error: Directory not found."
    except PermissionError:
        return "Error: Permission denied."
    except Exception as e:
        return f"Error: {str(e)}"


In [33]:
# Count the mol files written so far and compare with the expected number
directory_path = "/home/jackmcgoldrick/openTECR/data/molFiles_Final"
file_count = count_files_in_directory(directory_path)
print(f"Number of files in the directory: {file_count}")

Number of files in the directory: 564


Success!!

#### Generating molFiles for those Molecules with SMILES

In [34]:
def gen_molFile_fromSMILES(smiles, output_dir, dataframe, log_file="failures.log"):
    """
    Converts a SMILES string into a mol file using RDKit and saves it.

    One ``<KEGG_ID>.mol`` file is written for every row of ``dataframe`` whose
    "SMILES" value equals ``smiles``. Any failure is appended to ``log_file``
    (together with all KEGG IDs concerned) and printed; no exception is
    propagated to the caller.

    Args:
        smiles (str): The SMILES string of the molecule.
        output_dir (str): The full path to the directory to save the mol files.
        dataframe (pd.DataFrame): DataFrame containing the "SMILES" and "KEGG_ID" columns.
        log_file (str): Path to the log file for recording failures.
    """
    # KEGG ID(s) reported in the failure log; stays "N/A" until the lookup has succeeded
    kegg_label = "N/A"
    try:
        # Retrieve all KEGG IDs associated with the SMILES
        # (a missing/NaN SMILES never compares equal, so it matches no row)
        rows = dataframe[dataframe["SMILES"] == smiles]
        if rows.empty:
            raise ValueError(f"No KEGG ID found for SMILES: {smiles}")
        kegg_ids = [str(kegg) for kegg in rows["KEGG_ID"]]
        # Every matching KEGG ID is named, so a failure on a shared (or empty)
        # SMILES is not attributed to the first row only
        kegg_label = ", ".join(kegg_ids)

        # Validate the SMILES string
        if not smiles or not isinstance(smiles, str):
            raise ValueError(f"Invalid SMILES string: {smiles}")

        # Convert the SMILES string into an RDKit Mol object (once, not once per row)
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"RDKit failed to convert SMILES: {smiles}")

        # Build the mol block before any file is opened, so a failure here
        # does not leave an empty .mol file behind
        mol_block = Chem.MolToMolBlock(mol)

        # Ensure the output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Write one file per KEGG ID sharing this SMILES
        for kegg in kegg_ids:
            file_path = os.path.join(output_dir, f"{kegg}.mol")
            with open(file_path, "w") as mol_file:
                mol_file.write(mol_block)
            print(f"MOL file saved as {file_path}")

    except Exception as e:
        # Log the failure details
        with open(log_file, "a") as log:
            log.write(f"Failure for SMILES: {smiles}, KEGG_ID: {kegg_label}, Error: {e}\n")
        print(f"Error: {e}")


In [35]:
# Same output directory as the InChI-derived mol files
output_directory = "/home/jackmcgoldrick/openTECR/data/molFiles_Final"

# Generate mol files from SMILES for the metabolites that have no InChI;
# gen_molFile_fromSMILES appends failures to its default log file instead of raising
for smiles in inchi_absent["SMILES"]:
    gen_molFile_fromSMILES(smiles, output_directory, inchi_absent)


MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01653.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00787.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01636.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01643.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01644.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01648.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01650.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01651.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01646.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01277.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C01653.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_Final/C00787.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data

In [36]:
# Example usage:
# Re-count the files in the output directory after the SMILES-derived mol files were added
directory_path = "/home/jackmcgoldrick/openTECR/data/molFiles_Final"
file_count = count_files_in_directory(directory_path)
print(f"Number of files in the directory: {file_count}")

Number of files in the directory: 602


### Testing a fix to the problem of tRNA molecules

In [None]:
def save_molfile(smiles, dataframe, output_dir, log_file):
    """
    Write a mol file for every KEGG ID in ``dataframe`` that carries ``smiles``.

    Thin wrapper around ``gen_molFile_fromSMILES`` (defined earlier in this
    notebook): the logic used to be duplicated here line for line, and only the
    argument order differs.

    Args:
        smiles (str): The SMILES string of the molecule.
        dataframe (pd.DataFrame): DataFrame containing the "SMILES" and "KEGG_ID" columns.
        output_dir (str): The full path to the directory to save the mol files.
        log_file (str): Path to the log file for recording failures.
    """
    gen_molFile_fromSMILES(smiles, output_dir, dataframe, log_file=log_file)



In [25]:
# Separate test directory, so this run does not touch the files in molFiles_Final
ouput_dir = "/home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3"

# Write a mol file for every SMILES of the metabolites that have no InChI
for smiles in inchi_absent['SMILES']:
    save_molfile(smiles, inchi_absent, ouput_dir, log_file="failures.log")

MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01653.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C00787.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01636.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01643.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01644.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01648.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01650.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01651.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01646.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01277.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/molFiles_smiles_test3/C01653.mol
MOL file saved as /home/jackmcgoldrick/openTECR/data/m

Now it works correctly 