# EMP Metabolomics feature metadata preparation

Author: Louis Felix Nothias, UC San Diego 

Date: 2021/01/28

### Objectives:
Prepare and concatenate feature metadata/annotations for either Feature-Based Molecular Networking (FBMN) or Classical Molecular Networking (CMN)
 
### Notes:
Some annotation jobs are still missing; their results need to be added here once they become available.

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## FEATURE-BASED MOLECULAR NETWORKING (FBMN)

In [49]:
# GNPS cluster-info summary file (FBMN job).
# Group columns (names starting with 'ATTRIBUTE' or 'GNPSGROUP') are dropped,
# the table is keyed by 'cluster index', and every remaining column receives
# a 'GNPS_' prefix so its origin stays visible in the merged table.
gnps_table = (
    pd.read_table('annotations_2_1/FBMN/FBMN_clusterinfo_sum_0f772203276e48c2b70617f93432e90c.tsv')
    .loc[:, lambda df: ~(df.columns.str.startswith('ATTRIBUTE')
                         | df.columns.str.startswith('GNPSGROUP'))]
    .set_index('cluster index')
    .add_prefix('GNPS_')
)
print(gnps_table.shape)
gnps_table.head(2)

(57339, 29)


Unnamed: 0_level_0,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,...,GNPS_SpectrumID,GNPS_SumPeakIntensity,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_neutral M mass,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,857.0,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,CCMSLIB00003358818,66873040.0,802,11722,,802,278.1904,1,278.1904,66873040.0
2,,,,0.0,0.0,0.0,0.0,0.0,0.0,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?ta...,...,CCMSLIB00000853048,35400630.0,794,1256,,794,415.2114,1,415.2114,35400630.0


In [50]:
# GNPS spectral library matches (FBMN job).
# Keyed by the '#Scan#' column; all other columns get a 'GNPS_LIB_' prefix.
annotation_table = (
    pd.read_table('annotations_2_1/FBMN/FBMN_DBresults_89bbd5a3bf2e47b4b1d2585a45273089.tsv',
                  index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIB_')
)
print(annotation_table.shape)
annotation_table.head(2)

(7469, 45)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass,GNPS_LIB_npclassifier_superclass,GNPS_LIB_npclassifier_class,GNPS_LIB_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CCMSLIB00003358818,Amitriptyline,ESI,HCD,Isolated,NIST,NIST,M+H,278.19,0.0,...,138,4708,KRMDCWKBEZIMAB-UHFFFAOYSA-N,KRMDCWKBEZIMAB,,,,Aporphine alkaloids,Aporphine alkaloids,Alkaloids
100,CCMSLIB00000507637,13E-Docosenamide,LC-ESI,LC-ESI-QTOF,Isolated,Metlin,,M+H,338.342,0.0,...,0,0,,,,,,,,


In [51]:
# GNPS spectral library matches in analogue mode (FBMN job).
# Keyed by the '#Scan#' column; all other columns get a 'GNPS_LIBA_' prefix
# to keep them distinct from the regular library matches.
annotation_table_analogue = (
    pd.read_table('annotations_2_1/FBMN/FBMN_analogue_resultDB_analogue7edaeec705084cb48823e73e3e5a30a1.tsv',
                  index_col=False)
    .set_index('#Scan#')
    .add_prefix('GNPS_LIBA_')
)
print(annotation_table_analogue.shape)
annotation_table_analogue.head(2)

(19117, 45)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
#Scan#,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,CCMSLIB00003358818,Amitriptyline,ESI,HCD,Isolated,NIST,NIST,M+H,278.19,0.0,...,138,4708,KRMDCWKBEZIMAB-UHFFFAOYSA-N,KRMDCWKBEZIMAB,,,,Aporphine alkaloids,Aporphine alkaloids,Alkaloids
100,CCMSLIB00003721899,cis-13-Docosenoic acid,ESI,HCD,Isolated,NIST,NIST,M+H-H2O,321.315,0.0,...,98,5771,DPUOLQHDNGRHBS-UHFFFAOYSA-N,DPUOLQHDNGRHBS,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,Unsaturated fatty acids,Unsaturated fatty acids,Fatty acids


In [52]:
# DEREPLICATOR significant hits (FBMN job).
# Only the columns listed below are kept; the table is keyed by 'Scan' and
# the remaining columns get a 'DEREP_' prefix.
annotation_dereplicator = (
    pd.read_table('annotations_2_1/FBMN/DEREPLICATOR-1fafd4d4-view_significant-main.tsv')
    .loc[:, ['Scan', 'Name', 'Score', 'P-Value', 'SpectrumMass', 'FDR', 'Adduct', 'SMILES']]
    .set_index('Scan')
    .add_prefix('DEREP_')
)
annotation_dereplicator.sort_index().head(2)

Unnamed: 0_level_0,DEREP_Name,DEREP_Score,DEREP_P-Value,DEREP_SpectrumMass,DEREP_FDR,DEREP_Adduct,DEREP_SMILES
Scan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1793,"Laxaphycin_B_32-Epimer,_53-deoxy",6,1.8e-07,681.412,1.72,M+2H,CCCCCCCC1CC(=O)NC(C(C)C)C(=O)NC(C(C(C)C)O)C(=O...
2132,Anacyclamide_A10_Anacyclamide_A10,7,1.2e-09,579.352,3.23,M+2H,CCC(C)C1C(=O)NC(Cc2c[nH]c3ccccc23)C(=O)NCC(=O)...


In [23]:
# DEREPLICATOR_PLUS -- not included for FBMN: there is currently a bug (TODO: add once fixed)

In [24]:
# CYCLONOVO -- not included for FBMN: no scan numbers are available in the results

In [53]:
# SIRIUS molecular-formula identifications (FBMN job).
# Keyed by 'shared_name'; all other columns get a 'SIR_MF_' prefix.
sirius_MF = (
    pd.read_table('annotations_2_1/FBMN/SIRIUS_formula_identifications_MF_network.txt')
    .set_index('shared_name')
    .add_prefix('SIR_MF_')
)
sirius_MF.head(2)

Unnamed: 0_level_0,SIR_MF_Zod_molecularFormula,SIR_MF_Zod_adduct,SIR_MF_Zod_ZodiacScore,SIR_MF_Zod_TreeScore,SIR_MF_Zod_numExplainedPeaks,SIR_MF_Zod_explainedIntensity,SIR_MF_Zod_id,SIR_MF_Zod_massErrorPrecursor(ppm),SIR_MF_Zod_medianAbsoluteMassErrorFragmentPeaks(ppm)
shared_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40952,C20H30O6,[M + H]+,1.0,365.748,56,0.94,30659_1907_EMPv2_SIRIUS_40952,-2.492,0.996
28915,C16H28O5,[M - H2O + H]+,1.0,359.185,60,1.0,24533_1907_EMPv2_SIRIUS_28915,-63598.732,1.496


In [54]:
# CSI:FingerID compound identifications (FBMN job), keyed by 'shared name'.
# No prefix is added in this cell; the columns are used as read from the file.
sirius_CSI = (
    pd.read_table('annotations_2_1/FBMN/SIRIUS_compound_identifications_adducts_CSIFingerID_network.txt')
    .set_index('shared name')
)
sirius_CSI.head(2)

Unnamed: 0_level_0,CSI_#adducts,CSI_#predictedFPs,CSI_ConfidenceScore,CSI_CSI:FingerIDScore,CSI_ZodiacScore,CSI_SiriusScore,CSI_molecularFormula,CSI_adduct,CSI_InChIkey2D,CSI_InChI,CSI_name,CSI_smiles,CSI_xlogp,CSI_pubchemids,CSI_links,CSI_dbflags,CSI_ionMass,CSI_retentionTimeInSeconds,CSI_id
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
31,4,5,,-155.724,0.909,25.632,C19H20O6,[M + H3N + H]+,OEKOYDNDUSEWAD,InChI=1S/C19H20O6/c1-22-15-7-5-6-13(20)18(15)1...,,COC1=CC=CC(=C1C(=O)C=CC2=CC(=C(C(=C2)OC)OC)OC)O,3.8,24721391;53405584,COCONUT:(CNP0153998);PubChem:(24721391 5340558...,3694594,362.159,314.0,30_140221_31
1078,1,1,,-139.338,1.0,35.636,C3H5O3P,[M + K]+,ISLWZBTUAHMAND,InChI=1S/C3H6O3P/c4-7-5-2-1-3-6-7/h1-3H2/q+1,"1,3,2-Dioxaphosphorinane-2-oxide",C1CO[P+](=O)OC1,0.1,6337098,MeSH:(6337098);PubChem:(6337098);PubMed,70,158.961,2671.382,1077_140221_1078


In [55]:
# CANOPUS compound-class predictions (FBMN job), keyed by 'shared name'.
# No prefix is added in this cell; the columns are used as read from the file.
sirius_CAN = (
    pd.read_table('annotations_2_1/FBMN/SIRIUS_canopus_summary_CANOPUS_network.txt')
    .set_index('shared name')
)
sirius_CAN.head(2)

Unnamed: 0_level_0,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
shared name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
49388,39129_1907_EMPv2_SIRIUS_49388,C16H19N3O3,[M + H]+,Dipeptides,Peptides,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives,Organic acids and derivatives,Organic compounds; Organoheterocyclic compound...
3585,21705_1907_EMPv2_SIRIUS_3585,C19H38O4,[M - H2O + H]+,1-monoacylglycerols,Monoacylglycerols,Monoradylglycerols,Glycerolipids,Lipids and lipid-like molecules,Organic compounds; Lipids and lipid-like molec...


In [56]:
# Create the FBMN master feature-metadata table.
# Every input table is indexed by its feature / scan identifier, so a
# column-wise concat aligns all annotations on that identifier.
master_annotation_table = pd.concat(
    [gnps_table, annotation_table, annotation_table_analogue,
     annotation_dereplicator, sirius_MF, sirius_CSI, sirius_CAN],
    axis=1, sort=False)

# Name the index explicitly before turning it into a column.
# Renaming an 'index' column after reset_index() only works while the
# concatenated frames carry *different* index names ('cluster index',
# '#Scan#', 'Scan', 'shared_name', 'shared name'), which leaves the result
# index unnamed. If the names ever agree, that rename would silently do
# nothing and the key column would be written under the wrong header.
master_annotation_table = (
    master_annotation_table
    .rename_axis('#featureID')
    .reset_index(drop=False)
)

# Persist as TSV; '#featureID' is already a regular column, so no index is written.
master_annotation_table.to_csv('annotations_2_1/FBMN_metabo_feature_metadata.tsv', sep='\t', index=False)
print(master_annotation_table.shape)
master_annotation_table.head(5)

(57339, 164)


Unnamed: 0,#featureID,GNPS_Annotated Adduct Features ID,GNPS_Best Ion,GNPS_Correlated Features Group ID,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,...,CSI_id,CAN_name,CAN_molecularFormula,CAN_adduct,CAN_most specific class,CAN_level 5,CAN_subclass,CAN_class,CAN_superclass,CAN_all classifications
0,1,,,857.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0_140221_1,38355_1907_EMPv2_SIRIUS_1,C20H23N,[M + H]+,Benzenoids,,,,Benzenoids,Organic compounds; Organonitrogen compounds; T...
1,2,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,43929_1907_EMPv2_SIRIUS_2,C24H34O8,[M - H4O2 + H]+,Dibenzylbutane lignans,,,Dibenzylbutane lignans,"Lignans, neolignans and related compounds",Organic compounds; Organoheterocyclic compound...
2,3,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,2_140221_3,34129_1907_EMPv2_SIRIUS_3,C24H30O6,[M + Na]+,Benzoic acid esters,Benzoic acid esters,Benzoic acids and derivatives,Benzene and substituted derivatives,Benzenoids,Organic compounds; Lipids and lipid-like molec...
3,4,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,,17051_1907_EMPv2_SIRIUS_4,C26H34O6,[M + H3N + H]+,Dialkyl ethers,Dialkyl ethers,Ethers,Organooxygen compounds,Organic oxygen compounds,Organic compounds; Organoheterocyclic compound...
4,5,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,4_140221_5,10528_1907_EMPv2_SIRIUS_5,C9H12O,[M - H2O + H]+,Monoterpenoids,,Monoterpenoids,Prenol lipids,Lipids and lipid-like molecules,Organic compounds; Prenol lipids; Organooxygen...


In [47]:
#list(master_annotation_table.columns)

## CLASSICAL MOLECULAR NETWORKING (CMN)

In [30]:
# GNPS cluster summary file (CMN job).
# Group columns (names starting with 'ATTRIBUTE', 'GNPSGROUP' or 'AllGroups')
# are dropped, the table is keyed by 'cluster index', and every remaining
# column receives a 'GNPS_' prefix.
gnps_CMN_table = (
    pd.read_table('annotations_2_1/CMN/CMN_55a43e151ee4481d8098b9bab9886fc7.clustersummary')
    .loc[:, lambda df: ~(df.columns.str.startswith('ATTRIBUTE')
                         | df.columns.str.startswith('GNPSGROUP')
                         | df.columns.str.startswith('AllGroups'))]
    .set_index('cluster index')
    .add_prefix('GNPS_')
)
print(gnps_CMN_table.shape)
gnps_CMN_table.head(2)

(26945, 27)


Unnamed: 0_level_0,GNPS_DefaultGroups,GNPS_EvenOdd,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,GNPS_GNPSLinkout_Network,...,GNPS_Smiles,GNPS_SpectrumID,GNPS_UniqueFileSources,GNPS_UniqueFileSourcesCount,GNPS_componentindex,GNPS_number of spectra,GNPS_parent mass,GNPS_precursor charge,GNPS_precursor mass,GNPS_sum(precursor intensity)
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"G1,G2,G3,G4,G5",0,1,1,3,1,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,5H3_4_57_mousseau-88-s065-a02.mzML|7F10_8_60_s...,8,-1,8,103.955,0,103.955,1507110.0
3,"G3,G4,G5",0,0,0,5,2,2,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,5B2_2_66_minich-76-s002-a04.mzML|6B11_5_10_mou...,9,-1,9,105.954,0,105.954,736212.0


In [31]:
# GNPS spectral library matches (CMN job).
# '#Scan#' is copied into a 'cluster index' column that becomes the index, so
# the original '#Scan#' column stays in the table (as 'GNPS_LIB_#Scan#').
# All columns get a 'GNPS_LIB_' prefix.
gnps_CMN_annotation = (
    pd.read_table('annotations_2_1/CMN/CMN_result_specnets_DB_6b96b833bf474ac9b6f510b6de29da3d.tsv',
                  index_col=False)
    .assign(**{'cluster index': lambda df: df['#Scan#']})
    .set_index('cluster index')
    .add_prefix('GNPS_LIB_')
)
print(gnps_CMN_annotation.shape)
gnps_CMN_annotation.head(2)

(1253, 46)


Unnamed: 0_level_0,GNPS_LIB_SpectrumID,GNPS_LIB_Compound_Name,GNPS_LIB_Ion_Source,GNPS_LIB_Instrument,GNPS_LIB_Compound_Source,GNPS_LIB_PI,GNPS_LIB_Data_Collector,GNPS_LIB_Adduct,GNPS_LIB_Precursor_MZ,GNPS_LIB_ExactMass,...,GNPS_LIB_MoleculeExplorerDatasets,GNPS_LIB_MoleculeExplorerFiles,GNPS_LIB_InChIKey,GNPS_LIB_InChIKey-Planar,GNPS_LIB_superclass,GNPS_LIB_class,GNPS_LIB_subclass,GNPS_LIB_npclassifier_superclass,GNPS_LIB_npclassifier_class,GNPS_LIB_npclassifier_pathway
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,CCMSLIB00003135379,Spectral Match to Monopalmitolein (9c) from NI...,ESI,HCD,Isolated,Data from Aaron Puri,Data deposited by daniel,M+H-H2O,311.257,0.0,...,94,4177,,,,,,,,
100007,CCMSLIB00005722082,"NCGC00385642-01!1,4-dihydroxyheptadec-16-en-2-...",LC-ESI,Orbitrap,Commercial,Pieter Dorrestein,lfnothias/robinschmid,[M-H2O+H]+,311.258,328.261,...,0,0,LUIGTZGBXWZJAX-UHFFFAOYSA-N,LUIGTZGBXWZJAX,Lipids and lipid-like molecules,Fatty Acyls,Fatty alcohols,Fatty alcohols,Fatty alcohols,Fatty acids


In [32]:
# GNPS spectral library matches in analogue mode (CMN job).
# '#Scan#' is copied into a 'cluster index' column that becomes the index, so
# the original '#Scan#' column stays in the table (as 'GNPS_LIBA_#Scan#').
# All columns get a 'GNPS_LIBA_' prefix.
gnps_CMN_annotation_analogue = (
    pd.read_table('annotations_2_1/CMN/CMN_analogue_result_specnets_DB_fe5c5877e71f462a8f4b61127e3de177.tsv',
                  index_col=False)
    .assign(**{'cluster index': lambda df: df['#Scan#']})
    .set_index('cluster index')
    .add_prefix('GNPS_LIBA_')
)
print(gnps_CMN_annotation_analogue.shape)
gnps_CMN_annotation_analogue.head(2)

(8856, 46)


Unnamed: 0_level_0,GNPS_LIBA_SpectrumID,GNPS_LIBA_Compound_Name,GNPS_LIBA_Ion_Source,GNPS_LIBA_Instrument,GNPS_LIBA_Compound_Source,GNPS_LIBA_PI,GNPS_LIBA_Data_Collector,GNPS_LIBA_Adduct,GNPS_LIBA_Precursor_MZ,GNPS_LIBA_ExactMass,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,CCMSLIB00003455074,Hexadecanedioic acid,ESI,HCD,Isolated,NIST,NIST,M+H-H2O,269.211,0.0,...,72,2853,QQHJDPROMQRDLA-UHFFFAOYSA-N,QQHJDPROMQRDLA,Lipids and lipid-like molecules,Fatty Acyls,Fatty acids and conjugates,Dicarboxylic acids,Dicarboxylic acids,Fatty acids
100007,CCMSLIB00003256268,"5,6-Dihydroxy-8Z,11Z,14Z-eicosatrienoic acid",ESI,HCD,Isolated,NIST,NIST,M+H,339.253,0.0,...,22,479,GFNYAPAJUNPMGH-UHFFFAOYSA-N,GFNYAPAJUNPMGH,Lipids and lipid-like molecules,Fatty Acyls,Eicosanoids,Hydroxy-hydroperoxyeicosatrienoic acids|Other ...,Hydroxy-hydroperoxyeicosatrienoic acids|Other ...,Fatty acids


In [33]:
# DEREPLICATOR significant hits (CMN job).
# Keep the columns listed below, expose 'Scan' under the name 'cluster index'
# and use it as the index, then tag the remaining columns with 'DEREP_'.
annotation_CMN_dereplicator = (
    pd.read_table('annotations_2_1/CMN/DEREPLICATOR-75da4c83-view_significant-main.tsv')
    .loc[:, ['LocalSpecIdx', 'Scan', 'SpectrumMass',
             'Name', 'Score', 'P-Value', 'FDR', 'Adduct', 'SMILES']]
    .rename(columns={'Scan': 'cluster index'})
    .set_index('cluster index')
    .add_prefix('DEREP_')
)
annotation_CMN_dereplicator.sort_index().tail(2)

Unnamed: 0_level_0,DEREP_LocalSpecIdx,DEREP_SpectrumMass,DEREP_Name,DEREP_Score,DEREP_P-Value,DEREP_FDR,DEREP_Adduct,DEREP_SMILES
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
399909,26798,952.528,stylissamide_X,10,1.9e-18,0.0,M+H,[C@@H]12N(C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H]...
400283,26827,968.523,stylissamide_X,9,4.1e-18,0.0,M+H,[C@@H]12N(C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H]...
400290,26828,969.554,stylissamide_X,6,1.9e-12,5.0,M+H,[C@@H]12N(C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H]...
400516,26855,985.549,stylissamide_X,9,2.3999999999999997e-19,0.0,M+H,[C@@H]12N(C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H]...
402055,26937,1202.85,"Cyclosporin,_9CI_2-[O-(2-Hydroxyethyl)-D-serine]",13,3.3e-23,0.0,M+H,CC=CCC(C)C(C1C(=O)NC(CC)C(=O)N(C)CC(=O)N(C)C(C...


In [34]:
# DEREPLICATOR+ significant hits (CMN job).
# Keep the columns listed below, expose 'Scan' under the name 'cluster index'
# and use it as the index, then tag the remaining columns with 'DEREP+_'.
annotation_CMN_dereplicatorplus = (
    pd.read_table('annotations_2_1/CMN/DEREPLICATOR_PLUS-31922c2d-view_significant-main.tsv')
    .loc[:, ['LocalSpecIdx', 'Scan', 'SpectrumMass',
             'Name', 'Score', 'FDR', 'Adduct', 'SMILES']]
    .rename(columns={'Scan': 'cluster index'})
    .set_index('cluster index')
    .add_prefix('DEREP+_')
)
annotation_CMN_dereplicatorplus.sort_index().head(2)


Unnamed: 0_level_0,DEREP+_LocalSpecIdx,DEREP+_SpectrumMass,DEREP+_Name,DEREP+_Score,DEREP+_FDR,DEREP+_Adduct,DEREP+_SMILES
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
26492,2350,224.201,Nigramide_O_Nigramide_O,13,0.0,M+2H,CCCCCC1C=CC(C(=O)NCC(C)C)C(CCCCC)C1/C=C/C(=O)N...
34917,3023,235.229,"Arenosclerin_A_Deoxy,_23,24-dihydro",19,0.0,M+2H,C1=C/CCN2CCC3C(CCCCCC/C=C/CCCCN4CC(CCCCCC/1)CC...


In [35]:
# Assemble the GNPS part of the CMN table: cluster summary, library matches
# and analogue matches, placed side by side and aligned on their shared
# 'cluster index' index.
master_CMN_annotation_table = pd.concat(
    objs=[
        gnps_CMN_table,
        gnps_CMN_annotation,
        gnps_CMN_annotation_analogue,
    ],
    sort=False,
    axis=1,
)
print(master_CMN_annotation_table.shape)
master_CMN_annotation_table.tail(2)

(26945, 119)


Unnamed: 0_level_0,GNPS_DefaultGroups,GNPS_EvenOdd,GNPS_G1,GNPS_G2,GNPS_G3,GNPS_G4,GNPS_G5,GNPS_G6,GNPS_GNPSLinkout_Cluster,GNPS_GNPSLinkout_Network,...,GNPS_LIBA_MoleculeExplorerDatasets,GNPS_LIBA_MoleculeExplorerFiles,GNPS_LIBA_InChIKey,GNPS_LIBA_InChIKey-Planar,GNPS_LIBA_superclass,GNPS_LIBA_class,GNPS_LIBA_subclass,GNPS_LIBA_npclassifier_superclass,GNPS_LIBA_npclassifier_class,GNPS_LIBA_npclassifier_pathway
cluster index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
402696,G2,0,0,3,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,
402704,G3,1,0,0,3,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,https://gnps.ucsd.edu/ProteoSAFe/result.jsp?vi...,...,,,,,,,,,,


In [36]:
# Combine the DEREPLICATOR and DEREPLICATOR+ hits (CMN) into one table.
# Outer join on 'cluster index' keeps clusters annotated by either tool.
dereplicator_master = annotation_CMN_dereplicator.merge(
    annotation_CMN_dereplicatorplus,
    how='outer',
    on="cluster index",
)
# Spot-check a single merged row.
dereplicator_master.iloc[5]

DEREP_LocalSpecIdx                                                 26855
DEREP_SpectrumMass                                               985.549
DEREP_Name                                                stylissamide_X
DEREP_Score                                                            9
DEREP_P-Value                                                    2.4e-19
DEREP_FDR                                                              0
DEREP_Adduct                                                         M+H
DEREP_SMILES           [C@@H]12N(C(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H]...
DEREP+_LocalSpecIdx                                                  NaN
DEREP+_SpectrumMass                                                  NaN
DEREP+_Name                                                          NaN
DEREP+_Score                                                         NaN
DEREP+_FDR                                                           NaN
DEREP+_Adduct                                      

In [37]:
# Build the final CMN master feature-metadata table: GNPS annotations joined
# (outer, on 'cluster index') with the combined DEREPLICATOR tables, then
# ordered by the DEREPLICATOR+ local spectrum index.
final_master_table = (
    master_CMN_annotation_table
    .merge(dereplicator_master, how='outer', on="cluster index")
    .sort_values("DEREP+_LocalSpecIdx")
)

In [41]:
# Write the final CMN table as a tab-separated file, keeping the index as the first column.
final_master_table.to_csv(
    path_or_buf='annotations_2_1/CMN_metabo_feature_metadata.tsv',
    index=True,
    sep='\t',
)