# Create and process custom DB of modified bile acids (unique)
Author: Louis Felix Nothias (UCSD). I reused a notebook from Ricardo Silva (UCSD) for NAP (custom-DB) https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006089

## Step 1 Run SmiLib v2.0 
http://melolab.org/smilib/

In [3]:
!python

Python 2.7.15 |Anaconda custom (64-bit)| (default, Dec 14 2018, 13:10:39) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> 
KeyboardInterrupt
>>> 
>>> 

## Step 2 - Process the results

In [2]:
import pandas as pd    
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdMD

In [24]:
df = pd.read_csv("SmiLib_output/output_combinatorial_library_unique_v4.txt", sep='\t', header=None)
df.rename(columns={0:'name'}, inplace=True)
df.rename(columns={1:'SMILES'}, inplace=True)
df.head(3)

Unnamed: 0,name,SMILES
0,24C-4O-carboxyl1unmodified.1_1,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...
1,24C-4O-carboxyl1unmodified.1_2,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...
2,24C-4O-carboxyl1unmodified.1_3,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...


In [25]:
df.shape

(7007, 2)

In [27]:
# Concat the two table (modified and non modifiable)
dfA = pd.read_csv("SmiLib_v2.0rc4/1902_bile_acids_hydroxyls_unique_carboxyls_only_unmodified.smi", sep='\t', header=None)
dfA.rename(columns={0:'name'}, inplace=True)
dfA.rename(columns={1:'SMILES'}, inplace=True)

dfX = pd.concat([df, dfA])
print(dfX.shape)
dfX.head(5)

(7527, 2)


Unnamed: 0,name,SMILES
0,24C-4O-carboxyl1unmodified.1_1,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...
1,24C-4O-carboxyl1unmodified.1_2,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...
2,24C-4O-carboxyl1unmodified.1_3,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...
3,24C-4O-carboxyl1unmodified.1_4,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...
4,24C-4O-carboxyl1unmodified.1_5,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...


In [28]:
# Drop duplicate SMILES metabolites to speed up things
df2 = dfX[['name','SMILES']]
df2 = df2.drop_duplicates(['SMILES'])
df2.shape

(7011, 2)

In [29]:
smi = list(df2['SMILES'])
len(smi)

7011

In [30]:
# Make a function to compute compound metadata

def getCPDmetadata(smi_struct):    
    try:
        #smiles_string = smi_struct
        yo = Chem.MolFromSmiles(smi_struct)
        inchi = Chem.rdinchi.MolToInchi(yo)
        ikey = Chem.rdinchi.InchiToInchiKey(inchi[0])
        ikey1 = ikey.split('-')[0]
        ikey2 = ikey.split('-')[1]
        form = rdMD.CalcMolFormula(yo)
        exmass = rdMD.CalcExactMolWt(yo)
        
        #smiles_list.append(smiles_string)
        inchi_list.append(inchi[0])
        ikey_list.append(ikey)       
        ikey1_list.append(ikey1)
        ikey2_list.append(ikey2)
        form_list.append(form)
        exmass_list.append(exmass)

    except:
        #smiles_list.append('')
        inchi_list.append('')
        ikey_list.append('')       
        ikey1_list.append('')
        ikey2_list.append('')
        form_list.append('')
        exmass_list.append('')

    #return [ikey, exmass, inchi[0], smi_struct, ikey2, ikey1, form, '', '', '', ''] 

In [31]:
# Create / clean the objects
yo = []
smiles_list = []
inchi = []
ikey = []
ikey1 = []
ikey2 = []
form = []
exmass = []
inchi_list = []
ikey_list = []
ikey1_list = []
ikey2_list = []
form_list = []
exmass_list = []

# Run the def function
for i in smi:
    getCPDmetadata(i)

In [32]:
# Make the table
data = {'inchikey': ikey_list, 'MonoisotopicMass': exmass_list, 'InChI': inchi_list, 'SMILES': list(df2['SMILES']),
              'Identifier': list(df2['name']), 'InChIKey2': ikey1_list, 'InChIKey1': ikey2_list, 'MolecularFormula': form_list}

cn = ["inchikey", "MonoisotopicMass", "InChI", "SMILES", "Identifier", "InChIKey2", "InChIKey1", "MolecularFormula"]
formdata = pd.DataFrame(data, columns=cn)
formdata.shape

(7011, 8)

In [33]:
formdata.head(5)

Unnamed: 0,inchikey,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula
0,WZLRQQRJNRMKIW-UHFFFAOYSA-N,495.32,InChI=1S/C27H45NO7/c1-13(5-8-21(31)28-14(2)25(...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_1,WZLRQQRJNRMKIW,UHFFFAOYSA,C27H45NO7
1,BAFYFSONXMCIKD-UHFFFAOYSA-N,580.384,InChI=1S/C30H52N4O7/c1-15(6-9-23(37)34-21(27(4...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_2,BAFYFSONXMCIKD,UHFFFAOYSA,C30H52N4O7
2,XMSIEECKJXZZDY-UHFFFAOYSA-N,538.325,InChI=1S/C28H46N2O8/c1-13(4-7-22(34)30-19(26(3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_3,XMSIEECKJXZZDY,UHFFFAOYSA,C28H46N2O8
3,KLIROXLRHRSESN-UHFFFAOYSA-N,539.309,InChI=1S/C28H45NO9/c1-13(4-7-21(32)29-19(26(37...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_4,KLIROXLRHRSESN,UHFFFAOYSA,C28H45NO9
4,MAGMUIMMWBDOBL-UHFFFAOYSA-N,527.292,InChI=1S/C27H45NO7S/c1-13(4-7-21(31)28-19(12-3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_5,MAGMUIMMWBDOBL,UHFFFAOYSA,C27H45NO7S


## Step 3 - Recovering the name of building blocks
Note this is hardcoded, so be very carefull

In [34]:
# BUILDING BLOCKS
formdata['Identifier'] = formdata['Identifier'].str.replace('_10', '_isoleucine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_11', '_leucine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_12', '_lysine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_13', '_methionine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_14', '_phenylalanine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_15', '_proline')
formdata['Identifier'] = formdata['Identifier'].str.replace('_16', '_serine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_17', '_threonine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_18', '_tryptophan')
formdata['Identifier'] = formdata['Identifier'].str.replace('_19', '_tyrosine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_20', '_valine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_21', '_selenocysteine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_22', '_pyrrolysine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_23', '_tauric acid')
#===
formdata['Identifier'] = formdata['Identifier'].str.replace('_24', '_cysteate')
formdata['Identifier'] = formdata['Identifier'].str.replace('_25', '_homocysteate')
formdata['Identifier'] = formdata['Identifier'].str.replace('_26', '_homocysteine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_27', '_glucamine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_28', '_serinol')
#====
formdata['Identifier'] = formdata['Identifier'].str.replace('_29', '_free_acid')
formdata['Identifier'] = formdata['Identifier'].str.replace('_30', '_peracid')
formdata['Identifier'] = formdata['Identifier'].str.replace('_31', '_methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_32', '_ethylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_33', '_propylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_34', '_butylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_35', '_pentalytated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_36', '_hexalytated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_37', '_heptalytated')
#==============
formdata['Identifier'] = formdata['Identifier'].str.replace('_38', '_alanine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_39', '_arginine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_40', '_asparagine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_41', '_aspartic acid-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_42', '_cysteine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_43', '_glutamic acid-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_44', '_glutamine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_45', '_glycine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_46', '_histidine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_47', '_isoleucine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_48', '_leucine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_49', '_lysine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_50', '_methionine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_51', '_phenylalanine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_52', '_serine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_53', '_threonine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_54', '_tryptophan-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_55', '_tyrosine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_56', '_valine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_57', '_selenocysteine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_58', '_pyrrolysine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_59', '_tauric acid-N-Methylated')
#===
formdata['Identifier'] = formdata['Identifier'].str.replace('_60', '_cysteate-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_61', '_homocysteate-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_62', '_homocysteine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_63', '_glucamine-N-Methylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_64', '_serinol-N-Methylated')
#====

formdata['Identifier'] = formdata['Identifier'].str.replace('_65', '_alanine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_66', '_arginine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_67', '_asparagine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_68', '_aspartic acid-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_69', '_cysteine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_70', '_glutamic acid-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_71', '_glutamine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_72', '_glycine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_73', '_histidine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_74', '_isoleucine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_75', '_leucine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_76', '_lysine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_77', '_methionine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_78', '_phenylalanine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_79', '_serine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_80', '_threonine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_81', '_tryptophan-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_82', '_tyrosine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_83', '_valine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_84', '_selenocysteine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_85', '_pyrrolysine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_86', '_tauric acid-N-hydroxylated')
#===
formdata['Identifier'] = formdata['Identifier'].str.replace('_87', '_cysteate-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_88', '_homocysteate-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_89', '_homocysteine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_90', '_glucamine-N-hydroxylated')
formdata['Identifier'] = formdata['Identifier'].str.replace('_91', '_serinol-N-hydroxylated')
#====
formdata['Identifier'] = formdata['Identifier'].str.replace('_1', '_alanine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_2', '_arginine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_3', '_asparagine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_4', '_aspartic acid')
formdata['Identifier'] = formdata['Identifier'].str.replace('_5', '_cysteine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_6', '_glutamic acid')
formdata['Identifier'] = formdata['Identifier'].str.replace('_7', '_glutamine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_8', '_glycine')
formdata['Identifier'] = formdata['Identifier'].str.replace('_9', '_histidine')

formdata.head(15)

Unnamed: 0,inchikey,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula
0,WZLRQQRJNRMKIW-UHFFFAOYSA-N,495.32,InChI=1S/C27H45NO7/c1-13(5-8-21(31)28-14(2)25(...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_alanine,WZLRQQRJNRMKIW,UHFFFAOYSA,C27H45NO7
1,BAFYFSONXMCIKD-UHFFFAOYSA-N,580.384,InChI=1S/C30H52N4O7/c1-15(6-9-23(37)34-21(27(4...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_arginine,BAFYFSONXMCIKD,UHFFFAOYSA,C30H52N4O7
2,XMSIEECKJXZZDY-UHFFFAOYSA-N,538.325,InChI=1S/C28H46N2O8/c1-13(4-7-22(34)30-19(26(3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_asparagine,XMSIEECKJXZZDY,UHFFFAOYSA,C28H46N2O8
3,KLIROXLRHRSESN-UHFFFAOYSA-N,539.309,InChI=1S/C28H45NO9/c1-13(4-7-21(32)29-19(26(37...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_aspartic acid,KLIROXLRHRSESN,UHFFFAOYSA,C28H45NO9
4,MAGMUIMMWBDOBL-UHFFFAOYSA-N,527.292,InChI=1S/C27H45NO7S/c1-13(4-7-21(31)28-19(12-3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_cysteine,MAGMUIMMWBDOBL,UHFFFAOYSA,C27H45NO7S
5,ROCSDBWOTGPGBL-UHFFFAOYSA-N,553.325,InChI=1S/C29H47NO9/c1-14(4-8-22(33)30-20(27(38...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_glutamic acid,ROCSDBWOTGPGBL,UHFFFAOYSA,C29H47NO9
6,UFVGETNMUWKCEG-UHFFFAOYSA-N,552.341,InChI=1S/C29H48N2O8/c1-14(4-9-23(35)31-20(27(3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_glutamine,UFVGETNMUWKCEG,UHFFFAOYSA,C29H48N2O8
7,NFPNFEUFLYLQBM-UHFFFAOYSA-N,481.304,InChI=1S/C26H43NO7/c1-13(4-7-20(30)27-12-21(31...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_glycine,NFPNFEUFLYLQBM,UHFFFAOYSA,C26H43NO7
8,SUVKRFMBXSZUOH-UHFFFAOYSA-N,561.341,InChI=1S/C30H47N3O7/c1-15(4-7-24(36)33-22(28(3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_histidine,SUVKRFMBXSZUOH,UHFFFAOYSA,C30H47N3O7
9,PVXXEZWFUMUDPD-UHFFFAOYSA-N,537.367,InChI=1S/C30H51NO7/c1-6-15(2)25(28(37)38)31-23...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_isoleucine,PVXXEZWFUMUDPD,UHFFFAOYSA,C30H51NO7


In [40]:
formdata['Identifier'][1585]

'24C-3O-carboxyl8unmodified.1_arginine-N-Methylated'

In [41]:
formdata2 = formdata
formdata2['kingdom_name'] = 'Organic compounds'
formdata2['superclass_name'] = 'Lipids and lipid-like molecules'
formdata2['class_name'] = 'Steroids and steroid derivatives'
formdata2['subclass_name'] = 'Bile acids, alcohols and derivatives'
formdata2 = formdata2.fillna('')
formdata2.drop('inchikey', axis=1, inplace=True)
formdata2.shape

(7011, 11)

In [42]:
formdata2.head(5)

Unnamed: 0,MonoisotopicMass,InChI,SMILES,Identifier,InChIKey2,InChIKey1,MolecularFormula,kingdom_name,superclass_name,class_name,subclass_name
0,495.32,InChI=1S/C27H45NO7/c1-13(5-8-21(31)28-14(2)25(...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_alanine,WZLRQQRJNRMKIW,UHFFFAOYSA,C27H45NO7,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,"Bile acids, alcohols and derivatives"
1,580.384,InChI=1S/C30H52N4O7/c1-15(6-9-23(37)34-21(27(4...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_arginine,BAFYFSONXMCIKD,UHFFFAOYSA,C30H52N4O7,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,"Bile acids, alcohols and derivatives"
2,538.325,InChI=1S/C28H46N2O8/c1-13(4-7-22(34)30-19(26(3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_asparagine,XMSIEECKJXZZDY,UHFFFAOYSA,C28H46N2O8,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,"Bile acids, alcohols and derivatives"
3,539.309,InChI=1S/C28H45NO9/c1-13(4-7-21(32)29-19(26(37...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_aspartic acid,KLIROXLRHRSESN,UHFFFAOYSA,C28H45NO9,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,"Bile acids, alcohols and derivatives"
4,527.292,InChI=1S/C27H45NO7S/c1-13(4-7-21(31)28-19(12-3...,CC(C1CCC2C1(C)C(O)CC3C2C(O)C(O)C4C3(C)CCC(O)C4...,24C-4O-carboxyl1unmodified.1_cysteine,MAGMUIMMWBDOBL,UHFFFAOYSA,C27H45NO7S,Organic compounds,Lipids and lipid-like molecules,Steroids and steroid derivatives,"Bile acids, alcohols and derivatives"


## Last step - Export the results
Note that this is the NAP format. For SIRIUS results make a two column tab-separated file with ID nd SMILES.

In [45]:
formdata2.to_csv('Processed_database/1902_Bile_acids_combinatorial_COSMIC_unique_AA_v190318.txt', index=False, sep='\t')

## Check one entry

In [43]:
formdata2.iloc[1500]

MonoisotopicMass                                              479.325
InChI               InChI=1S/C27H45NO6/c1-15(5-6-22(32)28(4)14-23(...
SMILES              CC(C1C(O)CC2C1(C)CCC3C2C(O)CC4C3(C)CCC(O)C4)CC...
Identifier          24C-3O-carboxyl7unmodified.1_glycine-N-Methylated
InChIKey2                                              HZPYTIPWBXFHQC
InChIKey1                                                  UHFFFAOYSA
MolecularFormula                                            C27H45NO6
kingdom_name                                        Organic compounds
superclass_name                       Lipids and lipid-like molecules
class_name                           Steroids and steroid derivatives
subclass_name                    Bile acids, alcohols and derivatives
Name: 1500, dtype: object