# EMP Metabolomics feature metadata preparation

Author: Louis Felix Nothias, UC San Diego 

Date: 2021/01/28

### Objectives:
Prepare and concatenate feature metadata/annotations for either Feature-Based Molecular Networking (FBMN) or Classical Molecular Networking (CMN)
 
### Notes:
Some annotation jobs are still missing. They need to be added when they become available.

In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## FEATURE BASED MOLECULAR NETWORKING

In [57]:
# Load the GNPS FBMN cluster-info summary and prepare it for concatenation:
#   * discard the group/attribute columns (names starting with 'ATTRIBUTE' or 'GNPSGROUP'),
#   * index the rows by 'cluster index',
#   * prefix every remaining column with 'GNPS_' so its origin stays visible in the master table.
gnps_table = (
    pd.read_table('annotations_2_1/FBMN/FBMN_clusterinfo_sum_0f772203276e48c2b70617f93432e90c.tsv')
    .loc[:, lambda df: ~(df.columns.str.startswith('ATTRIBUTE')
                         | df.columns.str.startswith('GNPSGROUP'))]
    .set_index('cluster index')
    .add_prefix('GNPS_')
)
print(gnps_table.shape)
gnps_table.head(2)

(57339, 29)


Unnamed: 0_level_0,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,...,GNPS_SpectrumID,GNPS_SumPeakIntensity,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_neutral M mass,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,857.0,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,CCMSLIB00003358818,66873040.0,802,11722,,802,278.1904,1,278.1904,66873040.0
2,,,,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,CCMSLIB00000853048,35400630.0,794,1256,,794,415.2114,1,415.2114,35400630.0


In [58]:
# GNPS spectral library matches: index by '#Scan#' and tag every column with 'GNPS_LIB_'.
# index_col=False keeps pandas from using the first column of the file as the index.
annotation_table = (
    pd.read_table('annotations_2_1/FBMN/FBMN_DBresults_89bbd5a3bf2e47b4b1d2585a45273089.tsv', index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIB_')
)
print(annotation_table.shape)
annotation_table.head(2)

(7469, 45)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass,GNPS_LIB_npclassifier_superclass,GNPS_LIB_npclassifier_class,GNPS_LIB_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CCMSLIB00003358818,Amitriptyline,ESI,HCD,Isolated,NIST,NIST,M+H,278.19,0.0,...,138,4708,KRMDCWKBEZIMAB-UHFFFAOYSA-N,KRMDCWKBEZIMAB,,,,Aporphine alkaloids,Aporphine alkaloids,Alkaloids
100,CCMSLIB00000507637,13E-Docosenamide,LC-ESI,LC-ESI-QTOF,Isolated,Metlin,,M+H,338.342,0.0,...,0,0,,,,,,,,


In [59]:
# GNPS spectral library matches in analogue mode: index by '#Scan#' and tag every
# column with 'GNPS_LIBA_' so they cannot collide with the regular library matches.
annotation_table_analogue = (
    pd.read_table('annotations_2_1/FBMN/FBMN_analogue_resultDB_analogue7edaeec705084cb48823e73e3e5a30a1.tsv', index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIBA_')
)
print(annotation_table_analogue.shape)
annotation_table_analogue.head(2)

(19117, 45)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CCMSLIB00003358818,Amitriptyline,ESI,HCD,Isolated,NIST,NIST,M+H,278.19,0.0,...,138,4708,KRMDCWKBEZIMAB-UHFFFAOYSA-N,KRMDCWKBEZIMAB,,,,Aporphine alkaloids,Aporphine alkaloids,Alkaloids
100,CCMSLIB00003721899,cis-13-Docosenoic acid,ESI,HCD,Isolated,NIST,NIST,M+H-H2O,321.315,0.0,...,98,5771,DPUOLQHDNGRHBS-UHFFFAOYSA-N,DPUOLQHDNGRHBS,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,Unsaturated fatty acids,Unsaturated fatty acids,Fatty acids


In [60]:
# DEREPLICATOR significant hits: keep only the columns of interest, index by 'Scan',
# and prefix the remaining columns with 'DEREP_'.
annotation_dereplicator = (
    pd.read_table('annotations_2_1/FBMN/DEREPLICATOR-1fafd4d4-view_significant-main.tsv')
    .loc[:, ['Scan', 'Name', 'Score', 'P-Value', 'SpectrumMass', 'FDR', 'Adduct', 'SMILES']]
    .set_index('Scan')
    .add_prefix('DEREP_')
)
print(annotation_dereplicator.shape)
annotation_dereplicator.sort_index().head(2)

(356, 7)


Unnamed: 0_level_0,DEREP_Name,DEREP_Score,DEREP_P-Value,DEREP_SpectrumMass,DEREP_FDR,DEREP_Adduct,DEREP_SMILES
Scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1793,"Laxaphycin_B_32-Epimer,_53-deoxy",6,1.8e-07,681.412,1.72,M+2H,CCCCCCCC1CC(=O)NC(C(C)C)C(=O)NC(C(C(C)C)O)C(=O...
2132,Anacyclamide_A10_Anacyclamide_A10,7,1.2e-09,579.352,3.23,M+2H,CCC(C)C1C(=O)NC(Cc2c[nH]c3ccccc23)C(=O)NCC(=O)...


In [61]:
# DEREPLICATOR+ significant hits: keep only the columns of interest, index by 'Scan',
# and prefix the remaining columns with 'DEREP+_'.
annotation_dereplicator_plus = (
    pd.read_table('annotations_2_1/FBMN/DEREPLICATOR_PLUS-ee40831b-view_significant-main.tsv')
    .loc[:, ['LocalSpecIdx', 'Scan', 'SpectrumMass', 'Name', 'Score', 'FDR', 'Adduct', 'SMILES']]
    .set_index('Scan')
    .add_prefix('DEREP+_')
)
print(annotation_dereplicator_plus.shape)
annotation_dereplicator_plus.sort_index().head(2)

(11130, 7)


Unnamed: 0_level_0,DEREP+_LocalSpecIdx,DEREP+_SpectrumMass,DEREP+_Name,DEREP+_Score,DEREP+_FDR,DEREP+_Adduct,DEREP+_SMILES
Scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
45,39,219.174,"4,7,11-Cadinatrien-3-ol_4,7,11-Cadinatrien-3-ol",14,0.0,M+H,C=C(C)C1=CCC(C)C2CC(O)C(C)=CC12
66,57,219.174,"6,8,10,12-Pentadecatetraenal,_9CI_6,8,10,12-Pe...",13,0.0,M+H,CC/C=C/C=C/C=C/C=C/CCCCC=O


In [62]:
# CYCLONOVO: the results contain no scan numbers, so they cannot be mapped to features. An issue has been opened on GitHub.

In [63]:
# SIRIUS molecular-formula identifications: index by 'shared_name' and
# prefix every column with 'SIR_MF_'.
sirius_MF = (
    pd.read_table('annotations_2_1/FBMN/SIRIUS_formula_identifications_MF_network.txt')
    .set_index('shared_name')
    .add_prefix('SIR_MF_')
)
sirius_MF.head(2)

Unnamed: 0_level_0,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40952,C20H30O6,[M + H]+,1.0,365.748,56,0.94,30659_1907_EMPv2_SIRIUS_40952,-2.492,0.996
28915,C16H28O5,[M - H2O + H]+,1.0,359.185,60,1.0,24533_1907_EMPv2_SIRIUS_28915,-63598.732,1.496


In [64]:
# CSI:FingerID compound identifications, indexed by 'shared name'.
# The columns are used as they come from the file (no prefix is added here).
sirius_CSI = (
    pd.read_table('annotations_2_1/FBMN/SIRIUS_compound_identifications_adducts_CSIFingerID_network.txt')
    .set_index('shared name')
)
sirius_CSI.head(2)

Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
49388,4,4,,-154.104,0.728,64.284,C16H16N2O3,[M + H3N + H]+,BUGCHAIWUSBYIZ,InChI=1S/C16H16N2O3/c1-11(19)17-13-3-7-15(8-4-...,Oprea1_347362,CC(=O)NC1=CC=C(C=C1)OC2=CC=C(C=C2)NC(=O)C,2.414,95125,MeSH:(95125);PubChem:(95125);PubMed,70,302.15,139.912,39129_1907_EMPv2_SIRIUS_49388
3585,1,1,,-24.063,1.0,143.896,C19H38O4,[M - H2O + H]+,QHZLMUACJMDIAE,InChI=1S/C19H38O4/c1-2-3-4-5-6-7-8-9-10-11-12-...,Monopalmitin,CCCCCCCCCCCCCCCC(=O)OCC(CO)O,5.566,14900;3084463;11359451;54229153;57694152,HMDB:(11564);PubChem class - food;SuperNatural...,25991964798,313.274,469.949,21705_1907_EMPv2_SIRIUS_3585


In [65]:
# CANOPUS class predictions, indexed by 'shared name'.
# The columns are used as they come from the file (no prefix is added here).
sirius_CAN = (
    pd.read_table('annotations_2_1/FBMN/SIRIUS_canopus_summary_CANOPUS_network.txt')
    .set_index('shared name')
)
sirius_CAN.head(2)

Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
49388,39129_1907_EMPv2_SIRIUS_49388,C16H19N3O3,[M + H]+,Dipeptides,Peptides,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,Organic compounds; Organoheterocyclic compound...
3585,21705_1907_EMPv2_SIRIUS_3585,C19H38O4,[M - H2O + H]+,1-monoacylglycerols,Monoacylglycerols,Monoradylglycerols,Glycerolipids,Lipids and lipid-like molecules,Organic compounds; Lipids and lipid-like molec...


In [66]:
# Create the FBMN master table: align every annotation table on its index
# (the feature / scan number) and place them side by side.
master_annotation_table = pd.concat(
    [gnps_table, annotation_table, annotation_table_analogue,
     annotation_dereplicator, annotation_dereplicator_plus,
     sirius_MF, sirius_CSI, sirius_CAN],
    axis=1, sort=False)

# Name the index explicitly before turning it into a column. Relying on
# reset_index() producing a column literally called 'index' only works when the
# concatenated index happens to be unnamed; if it keeps a name, the rename would
# silently do nothing and the '#featureID' column expected downstream would be missing.
master_annotation_table.index.name = '#featureID'
master_annotation_table = master_annotation_table.reset_index(drop=False)

master_annotation_table.to_csv('annotations_2_1/FBMN_metabo_feature_metadata.tsv', sep='\t', index=False)
print(master_annotation_table.shape)
master_annotation_table.head(5)

(57339, 171)


Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,1,,,857.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38355_1907_EMPv2_SIRIUS_1,38355_1907_EMPv2_SIRIUS_1,C20H23N,[M + H]+,Benzenoids,,,,Benzenoids,Organic compounds; Organonitrogen compounds; T...
1,2,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,43929_1907_EMPv2_SIRIUS_2,43929_1907_EMPv2_SIRIUS_2,C24H34O8,[M - H4O2 + H]+,Dibenzylbutane lignans,,,Dibenzylbutane lignans,"Lignans, neolignans and related compounds",Organic compounds; Organoheterocyclic compound...
2,3,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,34129_1907_EMPv2_SIRIUS_3,34129_1907_EMPv2_SIRIUS_3,C24H30O6,[M + Na]+,Benzoic acid esters,Benzoic acid esters,Benzoic acids and derivatives,Benzene and substituted derivatives,Benzenoids,Organic compounds; Lipids and lipid-like molec...
3,4,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,17051_1907_EMPv2_SIRIUS_4,17051_1907_EMPv2_SIRIUS_4,C26H34O6,[M + H3N + H]+,Dialkyl ethers,Dialkyl ethers,Ethers,Organooxygen compounds,Organic oxygen compounds,Organic compounds; Organoheterocyclic compound...
4,5,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,10528_1907_EMPv2_SIRIUS_5,10528_1907_EMPv2_SIRIUS_5,C9H12O,[M - H2O + H]+,Monoterpenoids,,Monoterpenoids,Prenol lipids,Lipids and lipid-like molecules,Organic compounds; Prenol lipids; Organooxygen...


### Export a table with only features that passed the filters

In [67]:
# Features passing the filters: the feature IDs are read from the first column
# of the file and used as the index of FBMN_passing_filters.
FBMN_passing_filters = pd.read_csv('annotations_2_1/FBMN/emp500_lcms_fbmn_biom_noControls_controlFiltered_features.txt',  sep=',', index_col=0, header=0)
print('Features initially present in the FBMN table: '+ str(master_annotation_table.shape))

# Keep only the rows of the master table whose '#featureID' is among the passing features.
FBMN_for_filter = master_annotation_table.set_index(['#featureID'])
FBMN_filtered = FBMN_for_filter[FBMN_for_filter.index.isin(FBMN_passing_filters.index)]
print('Features remaining after filtering the FBMN table: '+ str(FBMN_filtered.shape[0]))

FBMN_filtered.to_csv('annotations_2_1/FBMN_metabo_feature_metadata_filtered.tsv', sep='\t', index=True)

Featues initially present in the FBMN table: (57339, 171)
Features remaining after filtering the FBMN table: 52496


In [68]:
# Quick look at the first ten column names of the FBMN master table.
master_annotation_table.columns[:10].tolist()

['#featureID',
 'GNPS_Annotated Adduct Features ID',
 'GNPS_Best Ion',
 'GNPS_Correlated Features Group ID',
 'GNPS_G1',
 'GNPS_G2',
 'GNPS_G3',
 'GNPS_G4',
 'GNPS_G5',
 'GNPS_G6']

# CLASSICAL MOLECULAR NETWORKING (CMN)

In [69]:
# Load the GNPS CMN cluster summary and prepare it for merging:
#   * discard the group columns (names starting with 'ATTRIBUTE', 'GNPSGROUP' or 'AllGroups'),
#   * index the rows by 'cluster index',
#   * prefix every remaining column with 'GNPS_'.
gnps_CMN_table = (
    pd.read_table('annotations_2_1/CMN/CMN_39fabe4366ae48788f52bc6dec68feb9.clustersummary')
    .loc[:, lambda df: ~(df.columns.str.startswith('ATTRIBUTE')
                         | df.columns.str.startswith('GNPSGROUP')
                         | df.columns.str.startswith('AllGroups'))]
    .set_index('cluster index')
    .add_prefix('GNPS_')
)
print(gnps_CMN_table.shape)
gnps_CMN_table.head(2)

(39235, 27)


Unnamed: 0_level_0,GNPS_DefaultGroups,GNPS_EvenOdd,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,GNPS_GNPSLinkout_Network,...,GNPS_Smiles,GNPS_SpectrumID,GNPS_UniqueFileSources,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"G1,G2,G3,G4,G5",0,1,1,3,1,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,Resusp_solvent_20190207170708.mzML|7F10_8_60_s...,8,-1,8,103.955,0,103.955,1507110.0
3,"G3,G4,G5",0,0,0,5,2,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,7B5_blank.mzML|5C10_3_57_sandin-54-s002-a04.mz...,9,-1,9,105.954,0,105.954,736212.0


In [70]:
# GNPS spectral library matches (CMN). The scan number is copied into a
# 'cluster index' column that becomes the index; the original '#Scan#' column is
# kept and, like every other column, receives the 'GNPS_LIB_' prefix.
gnps_CMN_annotation = (
    pd.read_table('annotations_2_1/CMN/CMN_resultDB_035d11d56ce0464b8b8f6b11e8728f6c.tsv', index_col=False)
    .assign(**{'cluster index': lambda df: df['#Scan#']})
    .set_index('cluster index')
    .add_prefix('GNPS_LIB_')
)
print(gnps_CMN_annotation.shape)
gnps_CMN_annotation.head(2)

(3131, 46)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass,GNPS_LIB_npclassifier_superclass,GNPS_LIB_npclassifier_class,GNPS_LIB_npclassifier_pathway
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100010,CCMSLIB00005722082,"NCGC00385642-01!1,4-dihydroxyheptadec-16-en-2-...",LC-ESI,Orbitrap,Commercial,Pieter Dorrestein,lfnothias/robinschmid,[M-H2O+H]+,311.258,328.261,...,0,0,LUIGTZGBXWZJAX-UHFFFAOYSA-N,LUIGTZGBXWZJAX,Lipids and lipid-like molecules,Fatty Acyls,Fatty alcohols,Fatty alcohols,Fatty alcohols,Fatty acids
100058,CCMSLIB00003302690,Monopalmitolein (9c),ESI,HCD,Isolated,NIST,NIST,M+H-H2O,311.258,0.0,...,67,3823,KVYUBFKSKZWZSV-UHFFFAOYSA-N,KVYUBFKSKZWZSV,Lipids and lipid-like molecules,Glycerolipids,Monoradylglycerols,Monoacylglycerols,Monoacylglycerols,Fatty acids


In [71]:
# GNPS spectral library matches in analogue mode (CMN). The scan number is copied
# into a 'cluster index' column that becomes the index; the original '#Scan#'
# column is kept and, like every other column, receives the 'GNPS_LIBA_' prefix.
gnps_CMN_annotation_analogue = (
    pd.read_table('annotations_2_1/CMN/CMN_analogue_resultDB_8e3e3655a06042a29950600274569a01.tsv', index_col=False)
    .assign(**{'cluster index': lambda df: df['#Scan#']})
    .set_index('cluster index')
    .add_prefix('GNPS_LIBA_')
)
print(gnps_CMN_annotation_analogue.shape)
gnps_CMN_annotation_analogue.head(2)

(11164, 46)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100010,CCMSLIB00003421948,(-)-Caryophyllene oxide,ESI,HCD,Isolated,NIST,NIST,M+H,221.19,0.0,...,46,1143,NVEQFIOZRFFVFW-UHFFFAOYSA-N,NVEQFIOZRFFVFW,Lipids and lipid-like molecules,Prenol lipids,Sesquiterpenoids,Caryophyllane sesquiterpenoids,Caryophyllane sesquiterpenoids,Terpenoids
100058,CCMSLIB00003400500,Muscone,ESI,HCD,Isolated,NIST,NIST,M+H,239.237,0.0,...,44,3114,ALHUZKCOMYUFRB-UHFFFAOYSA-N,ALHUZKCOMYUFRB,Organic oxygen compounds,Organooxygen compounds,Carbonyl compounds,Lactones,Lactones,Fatty acids


In [72]:
# DEREPLICATOR significant hits (CMN): keep the columns of interest, use the scan
# number as the 'cluster index' index, and prefix the remaining columns with 'DEREP_'.
annotation_CMN_dereplicator = (
    pd.read_table('annotations_2_1/CMN/DEREPLICATOR-1457a3f9-view_significant-main.tsv')
    .loc[:, ['LocalSpecIdx', 'Scan', 'SpectrumMass',
             'Name', 'Score', 'P-Value', 'FDR', 'Adduct', 'SMILES']]
    .rename(columns={'Scan': 'cluster index'})
    .set_index('cluster index')
    .add_prefix('DEREP_')
)
annotation_CMN_dereplicator.sort_index().tail(2)

Unnamed: 0_level_0,DEREP_LocalSpecIdx,DEREP_SpectrumMass,DEREP_Name,DEREP_Score,DEREP_P-Value,DEREP_FDR,DEREP_Adduct,DEREP_SMILES
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
399658,39222,1202.85,"Cyclosporin,_9CI_2-[O-(2-Hydroxyethyl)-D-serine]",13,3.3e-23,0.0,M+H,CC=CCC(C)C(C1C(=O)NC(CC)C(=O)N(C)CC(=O)N(C)C(C...
399673,39224,1219.88,"Cyclosporin,_9CI_Cyclosporin_L",6,7.1e-13,0.0,M+H,CC=CCC(C)C(C1C(=O)NC(CC)C(=O)N(C)CC(=O)N(C)C(C...


In [73]:
# DEREPLICATOR+ significant hits (CMN): keep the columns of interest, use the scan
# number as the 'cluster index' index, and prefix the remaining columns with 'DEREP+_'.
annotation_CMN_dereplicatorplus = (
    pd.read_table('annotations_2_1/CMN/DEREPLICATOR_PLUS-532ea29a-view_significant-main.tsv')
    .loc[:, ['LocalSpecIdx', 'Scan', 'SpectrumMass',
             'Name', 'Score', 'FDR', 'Adduct', 'SMILES']]
    .rename(columns={'Scan': 'cluster index'})
    .set_index('cluster index')
    .add_prefix('DEREP+_')
)
annotation_CMN_dereplicatorplus.sort_index().head(2)


Unnamed: 0_level_0,DEREP+_LocalSpecIdx,DEREP+_SpectrumMass,DEREP+_Name,DEREP+_Score,DEREP+_FDR,DEREP+_Adduct,DEREP+_SMILES
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
26128,3288,224.201,Nigramide_O_Nigramide_O,13,0.0,M+2H,CCCCCC1C=CC(C(=O)NCC(C)C)C(CCCCC)C1/C=C/C(=O)N...
34650,4226,235.229,"Arenosclerin_A_Deoxy,_23,24-dihydro",19,0.0,M+2H,C1=C/CCN2CCC3C(CCCCCC/C=C/CCCCN4CC(CCCCCC/1)CC...


In [74]:
# SIRIUS molecular-formula identifications (CMN).
# 'shared_name' values contain the text "ScanNumber"; a copy with that text
# replaced by a single space is stored as 'shared name' and used as the index.
# NOTE: the replacement is a space, not an empty string, so the index labels are
# strings with a leading space. The original 'shared_name' column is kept and
# ends up as 'SIR_MF_shared_name'.
CMN_sirius_MF = (
    pd.read_table('annotations_2_1/CMN/CMN_SIRIUS_formula_identifications_MF_network.txt')
    .assign(**{'shared name': lambda df: df['shared_name'].str.replace("ScanNumber"," ")})
    .set_index('shared name')
    .add_prefix('SIR_MF_')
)
CMN_sirius_MF.head(2)

Unnamed: 0_level_0,SIR_MF_shared_name,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
143039,ScanNumber143039,C18H34Cl2O4,[M - H4O2 + H]+,1.0,118.843,17,0.997,4356_EMP-CMN-clustered-b6e15e30aea24ded9a41337...,-103160.868,1.639
133278,ScanNumber133278,C21H40O5,[M - H4O2 + H]+,1.0,113.225,16,0.992,8173_EMP-CMN-clustered-b6e15e30aea24ded9a41337...,-106799.996,1.681


In [75]:
# CSI:FingerID compound identifications (CMN).
# The "ScanNumber" text in 'shared name' is replaced by a single space (so the
# labels are strings with a leading space) before the column becomes the index.
CMN_sirius_CSI = (
    pd.read_table('annotations_2_1/CMN/CMN_SIRIUS_compound_identifications_adducts_CSIFingerID_network.txt')
    .assign(**{'shared name': lambda df: df['shared name'].str.replace("ScanNumber"," ")})
    .set_index('shared name')
)
CMN_sirius_CSI.head(2)

Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
67972,3,3,,-75.522,1.0,56.711,C14H20O8,[M - H4O2 + H]+,MTHKUYBZPRWHDL,InChI=1S/C14H20O8/c1-19-7-4-3-5-8(20-2)13(7)22...,,COC1=C(C(=CC=C1)OC)OC2C(C(C(C(O2)CO)O)O)O,-0.386,10567394,COCONUT:(CNP0271006);Natural Products:(UNPD119...,3178498,281.102,128.063,2026_EMP-CMN-clustered-b6e15e30aea24ded9a41337...
36778,1,3,,-168.361,0.281,44.006,C7H7NO5S,[M + Na]+,RMNJNEUWTBBZPT,"InChI=1S/C7H7NO5S/c1-13-14(11,12)7-4-2-6(3-5-7...",Methyl 4-nitrobenzenesulfonate,COS(=O)(=O)C1=CC=C(C=C1)[N+](=O)[O-],1.091,22582;12103505;13099715;71309630;101680997;117...,NORMAN:(NS00034933);PubChem:(22582 12103505 13...,67379270,239.995,722.719,28851_EMP-CMN-clustered-b6e15e30aea24ded9a4133...


In [76]:
# CANOPUS class predictions (CMN).
# The "ScanNumber" text in 'shared name' is replaced by a single space (so the
# labels are strings with a leading space) before the column becomes the index.
CMN_sirius_CAN = (
    pd.read_table('annotations_2_1/CMN/CMN_SIRIUS_canopus_summary_CANOPUS_network.txt')
    .assign(**{'shared name': lambda df: df['shared name'].str.replace("ScanNumber"," ")})
    .set_index('shared name')
)
CMN_sirius_CAN.head(2)

Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
67972,2026_EMP-CMN-clustered-b6e15e30aea24ded9a41337...,C14H16O6,[M + H]+,Anisoles,,Anisoles,Phenol ethers,Benzenoids,Organic compounds; Organoheterocyclic compound...
326813,14905_EMP-CMN-clustered-b6e15e30aea24ded9a4133...,C29H44F3NO4,[M + K]+,Phenoxy compounds,,Phenoxy compounds,Benzene and substituted derivatives,Benzenoids,Organic compounds; Ethers; Organic acids and d...


In [77]:
# Master GNPS CMN table: cluster summary, library matches and analogue matches
# placed side by side, aligned on their shared 'cluster index' index.
master_CMN_annotation_table = pd.concat(
    [gnps_CMN_table, gnps_CMN_annotation, gnps_CMN_annotation_analogue],
    axis='columns',
    sort=False,
)
print(master_CMN_annotation_table.shape)
master_CMN_annotation_table.tail(2)

(39235, 119)


Unnamed: 0_level_0,GNPS_DefaultGroups,GNPS_EvenOdd,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,GNPS_GNPSLinkout_Network,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
400299,G2,0,0,3,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,
400307,G3,1,0,0,3,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,


In [78]:
# Combine the DEREPLICATOR and DEREPLICATOR+ results (CMN) with an outer join on
# 'cluster index', so a cluster annotated by only one of the tools is still kept.
dereplicator_master = annotation_CMN_dereplicator.merge(
    annotation_CMN_dereplicatorplus,
    on="cluster index",
    how='outer',
)
dereplicator_master.iloc[5]

DEREP_LocalSpecIdx                                                 35201
DEREP_SpectrumMass                                               596.401
DEREP_Name                              Emericellamides_Emericellamide_C
DEREP_Score                                                           10
DEREP_P-Value                                                      1e-19
DEREP_FDR                                                              0
DEREP_Adduct                                                         M+H
DEREP_SMILES           CCCCCCCC1C(C)C(=O)NCC(=O)NC(C(C)C)C(=O)NC(CC(C...
DEREP+_LocalSpecIdx                                                35201
DEREP+_SpectrumMass                                              596.401
DEREP+_Name                             Emericellamides_Emericellamide_C
DEREP+_Score                                                          22
DEREP+_FDR                                                             0
DEREP+_Adduct                                      

In [79]:
# Master SIRIUS CMN table: molecular formula, CSI:FingerID and CANOPUS results
# placed side by side, aligned on their index.
master_CMN_SIRIUS_table = pd.concat(
    [CMN_sirius_MF, CMN_sirius_CSI, CMN_sirius_CAN],
    axis=1,
    sort=False,
)
print(master_CMN_SIRIUS_table.shape)

# Cast the index labels to integers and name the index 'cluster index' so the
# table can be joined with the GNPS tables, then order the rows by that index.
master_CMN_SIRIUS_table.index = master_CMN_SIRIUS_table.index.astype('int').rename('cluster index')
master_CMN_SIRIUS_table = master_CMN_SIRIUS_table.sort_values("cluster index")
master_CMN_SIRIUS_table.head(2)

(35456, 38)


Unnamed: 0_level_0,SIR_MF_shared_name,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm),...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,ScanNumber1,FNO2,[M + K]+,0.391,-1.609,1,0.0,31830_EMP-CMN-clustered-b6e15e30aea24ded9a4133...,5.151,,...,,,,,,,,,,
3,ScanNumber3,C2HBS,[M + K]+,0.511,-3.219,1,0.0,5488_EMP-CMN-clustered-b6e15e30aea24ded9a41337...,-18.792,,...,,,,,,,,,,


In [80]:
# Intermediate CMN feature-metadata table: GNPS annotations outer-joined with the
# combined Dereplicator results on 'cluster index', ordered by that index.
intermediate_master_table = (
    master_CMN_annotation_table
    .merge(dereplicator_master, on="cluster index", how='outer')
    .sort_values("cluster index")
)
print(intermediate_master_table.shape)
intermediate_master_table.head(3)

(39243, 134)


Unnamed: 0_level_0,GNPS_DefaultGroups,GNPS_EvenOdd,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,GNPS_GNPSLinkout_Network,...,DEREP_FDR,DEREP_Adduct,DEREP_SMILES,DEREP+_LocalSpecIdx,DEREP+_SpectrumMass,DEREP+_Name,DEREP+_Score,DEREP+_FDR,DEREP+_Adduct,DEREP+_SMILES
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"G1,G2,G3,G4,G5",0,1,1,3,1,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,
3,"G3,G4,G5",0,0,0,5,2,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,
5,"G1,G2,G3,G4,G5,G6",0,24,8,3,1,4,2,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,


In [81]:
# Create the final CMN feature-metadata table by outer-joining the SIRIUS results
# onto the GNPS + Dereplicator table via the 'cluster index' index level.
#
# The frames are merged with their native dtypes. Casting them with .astype(str)
# first (as was done previously) turns every existing missing value into the
# literal string 'nan' and stringifies all numeric columns, while rows left
# unmatched by the merge still get real NaN. The exported TSV then mixes 'nan'
# text with empty cells. The cast is not needed for the join: the key is an
# index level, which astype() on the frame does not touch.
final_master_table = pd.merge(intermediate_master_table, master_CMN_SIRIUS_table, on="cluster index", how='outer')
print(final_master_table.shape)
#print(list(final_master_table.columns))
final_master_table.head(2)

(39243, 172)


Unnamed: 0_level_0,GNPS_DefaultGroups,GNPS_EvenOdd,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,GNPS_GNPSLinkout_Network,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"G1,G2,G3,G4,G5",0,1,1,3,1,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,
3,"G3,G4,G5",0,0,0,5,2,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,


In [82]:
# Write the complete CMN feature-metadata table to a tab-separated file,
# keeping the index as the first column.
final_master_table.to_csv(
    path_or_buf='annotations_2_1/CMN_metabo_feature_metadata.tsv',
    sep='\t',
    index=True,
)

### Export a table with only features that passed the filters

In [83]:
# Features passing the filters: the feature IDs are read from the first column
# of the file and used as the index of CMN_passing_filters.
CMN_passing_filters = pd.read_csv('annotations_2_1/CMN/emp500_lcms_cmn_biom_noControls_controlFiltered_features.txt',  sep=',', index_col=0, header=0)
# The messages below refer to the CMN table (they previously said "FBMN", a
# leftover from the copy-pasted FBMN cell).
print('Features initially present in the CMN table: '+ str(final_master_table.shape))

# Keep only the rows of the CMN master table whose index is among the passing features.
CMN_filtered = final_master_table[final_master_table.index.isin(CMN_passing_filters.index)]
print('Features remaining after filtering the CMN table: '+ str(CMN_filtered.shape[0]))

CMN_filtered.to_csv('annotations_2_1/CMN_metabo_feature_metadata_filtered.tsv', sep='\t', index=True)

Features initially present in the FBMN table: (39243, 172)
Features remaining after filtering the FBMN table: 37590
