## DEMO USAGE: GNPS downloader and post-processing

In [1]:
import sys 
sys.path.append('../src')
from consolidates_structures import consolidate_and_convert_structures
from gnps_download_results import gnps_download_results
from gnps_results_postprocess import *

## Download classical molecular networking

In [3]:
# Classical molecular networking job: fetch the GNPS results into
# `output_folder` and keep the returned annotation table for a quick preview.
gnps_annotations = gnps_download_results(
    job_id='bbee697a63b1400ea585410fafc95723',
    output_folder='gnps_results',
    return_annotation_table=True,
)
gnps_annotations.head(2)

This is the GNPS job link: https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=bbee697a63b1400ea585410fafc95723
Downloading the following content: https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=bbee697a63b1400ea585410fafc95723&view=view_all_annotations_DB
GNPS job results were succesfully downloaded as: gnps_results.zip
GNPS job results were succesfully extracted into the folder: gnps_results
   CLASSICAL MOLECULAR NETWORKING job detected
      199 spectral library annotations in the job.
      9643 nodes in the network (including single nodes)


Unnamed: 0,#Scan#,Adduct,CAS_Number,Charge,Compound_Name,Compound_Source,Data_Collector,ExactMass,FileScanUniqueID,INCHI,...,RT_Query,SharedPeaks,Smiles,SpecCharge,SpecMZ,SpectrumFile,SpectrumID,TIC_Query,UpdateWorkflowName,tags
0,100631,[M+H]+,,1,MoNA:594132 Octocrylene,isolated,MoNA,0.0,spectra/specs_ms.pklbin100631,InChI=1S/C24H27NO2/c1-3-5-12-19(4-2)18-27-24(2...,...,73.567,5,,0,362.208,spectra/specs_ms.pklbin,CCMSLIB00000566191,1910.37,UPDATE-SINGLE-ANNOTATED-BRONZE,
1,100637,[M+H]+,,1,MoNA:594132 Octocrylene,isolated,MoNA,0.0,spectra/specs_ms.pklbin100637,InChI=1S/C24H27NO2/c1-3-5-12-19(4-2)18-27-24(2...,...,433.325,7,,0,362.212,spectra/specs_ms.pklbin,CCMSLIB00000566191,31304.2,UPDATE-SINGLE-ANNOTATED-BRONZE,


## Download feature-based molecular networking

In [5]:
# Feature-based molecular networking (FBMN) job: fetch the GNPS results into
# `output_folder`. Note that this rebinds `gnps_annotations`; the cells below
# operate on this FBMN table.
gnps_annotations = gnps_download_results(
    job_id='2047c735fc3546f7a3a32c78245edccf',
    output_folder='gnps_results_fbmn',
    return_annotation_table=True,
)
gnps_annotations.head(2)

This is the GNPS job link: https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=2047c735fc3546f7a3a32c78245edccf
Downloading the following content: https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task=2047c735fc3546f7a3a32c78245edccf&view=view_all_annotations_DB
GNPS job results were succesfully downloaded as: gnps_results_fbmn.zip
GNPS job results were succesfully extracted into the folder: gnps_results_fbmn
   FEATURE-BASED MOLECULAR NETWORKING job detected - Version > 28
      206 spectral library annotations in the job.
      960 nodes in the network (including single nodes).


Unnamed: 0,SpectrumID,Compound_Name,Ion_Source,Instrument,Compound_Source,PI,Data_Collector,Adduct,Precursor_MZ,ExactMass,...,MoleculeExplorerDatasets,MoleculeExplorerFiles,InChIKey,InChIKey-Planar,superclass,class,subclass,npclassifier_superclass,npclassifier_class,npclassifier_pathway
0,CCMSLIB00003181672,Terpineol,ESI,Q-TOF,Isolated,NIST,NIST,M+H-H2O,137.133,0.0,...,0,0,WUOACPNHFRMFPN-UHFFFAOYSA-N,WUOACPNHFRMFPN,Lipids and lipid-like molecules,Prenol lipids,Monoterpenoids,Monoterpenoids,Menthane monoterpenoids|Monocyclic monoterpenoids,Terpenoids
1,CCMSLIB00000500284,3-Copaene,LC-ESI,LC-ESI-QTOF,Isolated,Metlin,,M+H,205.195,0.0,...,0,0,,,,,,,,


## Consolidate structures

In [6]:
# Merge the structures found in the 'Smiles' and 'INCHI' columns into a single
# set of consolidated structure columns (Consol_SMILES_iso, Consol_SMILES,
# Consol_InChIKey, Consol_InChI).
gnps_annotations_consolidated = consolidate_and_convert_structures(
    gnps_annotations,
    prefix='',
    smiles='Smiles',
    inchi='INCHI',
)
# Preview the table returned by the helper (the one the following cells use),
# rather than the input table, so the display does not depend on the helper
# mutating its argument in place.
gnps_annotations_consolidated.head(1)

Both SMILES and InChI were inputted
Converting SMILES to mol object
Succesfully converted to mol object: 148
Exception to the parsing: 0
Not available: 59
Converting INCHI to mol object
Succesfully converted to mol object: 155
Exception to the parsing: 0
Not available: 52
Consolidating the lists
Total mol object from the list 1 = 148
Mol object consolidated from list 2 = 9
Consolidated structures = 157
Converting mol objects to SMILES iso
Converting mol objects to SMILES
Converting mol objects to InChI
Converting mol objects to InChIKey
End


Unnamed: 0,SpectrumID,Compound_Name,Ion_Source,Instrument,Compound_Source,PI,Data_Collector,Adduct,Precursor_MZ,ExactMass,...,superclass,class,subclass,npclassifier_superclass,npclassifier_class,npclassifier_pathway,Consol_SMILES_iso,Consol_SMILES,Consol_InChIKey,Consol_InChI
0,CCMSLIB00003181672,Terpineol,ESI,Q-TOF,Isolated,NIST,NIST,M+H-H2O,137.133,0.0,...,Lipids and lipid-like molecules,Prenol lipids,Monoterpenoids,Monoterpenoids,Menthane monoterpenoids|Monocyclic monoterpenoids,Terpenoids,CC1=CCC(C(C)(C)O)CC1,CC1=CCC(C(C)(C)O)CC1,WUOACPNHFRMFPN-UHFFFAOYSA-N,"InChI=1S/C10H18O/c1-8-4-6-9(7-5-8)10(2,3)11/h4..."


## Filter annotations

In [9]:
# Filter the consolidated annotations by ionisation mode, ppm error, cosine
# score, number of shared peaks and spectrum charge (thresholds given below).
gnps_annotations_filtered = gnps_filter_annotations(
    gnps_annotations_consolidated,
    'Consol_InChI',
    ionisation_mode='pos',
    max_ppm_error=10,
    min_cosine=0.6,
    shared_peaks=6,
    max_spec_charge=1,
)

Initial number of annotations: 207
Remaining after ionisation mode filtering: 207
Remaining after max_ppm_error filtering: 193
Remaining after min_cosine filtering: 193
Remaining after number of shared_peaks filtering: 165
Remaining after number of spectrum charge filtering: 165


## Clean up annotations

In [10]:
# Clean up the filtered annotations using the consolidated InChI column; the
# flag below also enables removal of carbon-containing in-source fragments.
cleaned_up_gnps_annotations = gnps_clean_up_annotations(
    gnps_annotations_filtered,
    'Consol_InChI',
    remove_C_containing_in_source_fragment=True,
)

Initial number of annotations: 165
After removing annotations without structure: 117
After intrinsically charged molecules removed: 117
After carbon containing adducts filtering: 98


## Get molecular formula proxy

In [11]:
# Derive a molecular formula for each remaining annotation from its
# consolidated InChI string.
cleaned_up_gnps_annotations_formula = get_molecular_formula_from_inchi(
    cleaned_up_gnps_annotations,
    'Consol_InChI',
)

Initial number of annotations filtering: 98
After carbon containing adducts filtering: 98
Valid molecular formula: 98
