# EMP Metabolomics - Convert/consolidate structure annotations FBMN



In [1]:
from pathlib import Path

from convert_structures import *

## Load GNPS FBMN annotations

In [2]:
# Input table (tab-separated FBMN feature metadata); the export cell derives
# its output file name from this path.
path = 'annotations_2_1/FBMN_metabo_feature_metadata_filtered.tsv'

In [3]:
# Load the spectral-DB annotation table produced by FBMN (possibly compatible
# with V2). read_table uses a tab delimiter by default; the first row holds the
# column names.
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids mixed-dtype inference on this wide table.
df = pd.read_table(path, header=0, low_memory=False)

# Preview the first two rows.
df.head(2)

Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,48,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,51332_1907_EMPv2_SIRIUS_48,51332_1907_EMPv2_SIRIUS_48,C15H26O3,[M - H4O2 + H]+,Sesquiterpenoids,,Sesquiterpenoids,Prenol lipids,Lipids and lipid-like molecules,Organic compounds; Alcohols and polyols; Preno...
1,66,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,13928_1907_EMPv2_SIRIUS_66,13928_1907_EMPv2_SIRIUS_66,C15H26O3,[M - H4O2 + H]+,Prenol lipids,,,Prenol lipids,Lipids and lipid-like molecules,Organic compounds; Alcohols and polyols; Preno...


## Inspect the metadata

In [4]:
# Show the columns belonging to one annotation tool, selected by prefix.
# Other prefixes to try: GNPS_LIB_, DEREP_, DEREP+_, CSI_
show_metadata_tools(df, 'GNPS_LIBA_')

['GNPS_LIBA_SpectrumID', 'GNPS_LIBA_Compound_Name', 'GNPS_LIBA_Ion_Source', 'GNPS_LIBA_Instrument', 'GNPS_LIBA_Compound_Source', 'GNPS_LIBA_PI', 'GNPS_LIBA_Data_Collector', 'GNPS_LIBA_Adduct', 'GNPS_LIBA_Precursor_MZ', 'GNPS_LIBA_ExactMass', 'GNPS_LIBA_Charge', 'GNPS_LIBA_CAS_Number', 'GNPS_LIBA_Pubmed_ID', 'GNPS_LIBA_Smiles', 'GNPS_LIBA_INCHI', 'GNPS_LIBA_INCHI_AUX', 'GNPS_LIBA_Library_Class', 'GNPS_LIBA_IonMode', 'GNPS_LIBA_UpdateWorkflowName', 'GNPS_LIBA_LibraryQualityString', 'GNPS_LIBA_SpectrumFile', 'GNPS_LIBA_LibraryName', 'GNPS_LIBA_MQScore', 'GNPS_LIBA_Organism', 'GNPS_LIBA_TIC_Query', 'GNPS_LIBA_RT_Query', 'GNPS_LIBA_MZErrorPPM', 'GNPS_LIBA_SharedPeaks', 'GNPS_LIBA_MassDiff', 'GNPS_LIBA_LibMZ', 'GNPS_LIBA_SpecMZ', 'GNPS_LIBA_SpecCharge', 'GNPS_LIBA_FileScanUniqueID', 'GNPS_LIBA_NumberHits', 'GNPS_LIBA_tags', 'GNPS_LIBA_MoleculeExplorerDatasets', 'GNPS_LIBA_MoleculeExplorerFiles', 'GNPS_LIBA_InChIKey', 'GNPS_LIBA_InChIKey-Planar', 'GNPS_LIBA_superclass', 'GNPS_LIBA_class', 'GN

## Select the columns and convert/consolidate

### For GNPS library matches

In [5]:
# GNPS library matches: both a SMILES and an InChI column are supplied, so the
# helper reports converting each and consolidating the two lists.
consolidate_and_convert_structures(
    df,
    'GNPS_LIB_',
    smiles='GNPS_LIB_Smiles',
    inchi='GNPS_LIB_INCHI',
)

Both SMILES and InChI were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 5038
Exception to the parsing: 0
Not available: 47458
Converting INCHI to mol object
Succesfully converted to mol object: 5186
Exception to the parsing: 0
Not available: 47310
Consolidating the lists
Total mol object from the list 1 = 5037
Mol object consolidated from list 2 = 197
Consolidated structures = 5234
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End


### For GNPS library matches in analogue mode

In [6]:
# GNPS analogue-mode library matches: same treatment as the regular library
# matches, using the GNPS_LIBA_ SMILES and InChI columns.
consolidate_and_convert_structures(
    df,
    'GNPS_LIBA_',
    smiles='GNPS_LIBA_Smiles',
    inchi='GNPS_LIBA_INCHI',
)

Both SMILES and InChI were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 12466
Exception to the parsing: 0
Not available: 40030
Converting INCHI to mol object
Succesfully converted to mol object: 13149
Exception to the parsing: 0
Not available: 39347
Consolidating the lists
Total mol object from the list 1 = 12464
Mol object consolidated from list 2 = 825
Consolidated structures = 13289
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End


### For DEREPLICATOR

In [7]:
# DEREPLICATOR annotations: only a SMILES column is passed, so there is no
# InChI list to consolidate against.
consolidate_and_convert_structures(
    df,
    'DEREP_',
    smiles='DEREP_SMILES',
)

Only SMILES were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 341
Exception to the parsing: 0
Not available: 52155
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End


### For DEREPLICATOR+

In [8]:
# DEREPLICATOR+ annotations: only a SMILES column is passed, so there is no
# InChI list to consolidate against.
consolidate_and_convert_structures(
    df,
    'DEREP+_',
    smiles='DEREP+_SMILES',
)

Only SMILES were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 10127
Exception to the parsing: 0
Not available: 42369
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End


### Export table

In [9]:
# Write the table next to the input file as '<input stem>_consolidated.tsv'.
# The stem is taken with pathlib rather than by slicing off a fixed four
# characters, so the output name stays correct if the input extension changes.
input_path = Path(path)
output_path = input_path.with_name(input_path.stem + '_consolidated.tsv')

# index=False: the table is loaded without index_col, so df carries a plain
# positional index, and the loaded data already contains an 'Unnamed: 0'
# column (see the df.head(2) preview) - the header pandas assigns to a saved
# index column read back as data. Writing the index again would add one more
# unnamed column on every read/write round trip.
# NOTE(review): downstream readers that used index_col=0 on this file should
# now read it without index_col - confirm against consumers.
df.to_csv(output_path, sep='\t', index=False)