In [1]:
import matchms
import os
from omigami import Spec2Vec,MS2DeepScore
import pandas as pd

# Create the two omigami objects used for library matching further down.
# Only `spec2vec` is called in the visible cells; `ms2deepscore` is kept as the alternative.
# NOTE(review): these presumably need network access and/or credentials to work — confirm against the omigami docs.
spec2vec = Spec2Vec()
ms2deepscore = MS2DeepScore()

ModuleNotFoundError: No module named 'omigami'

# Helper functions:

In [2]:
def drop_null_spectra(spectra):
    """
    Returns only the spectra that contain at least one peak.

    Requirements:
    spectra(iterable): spectrum objects exposing ``peaks.mz``

    Returns:
    (list): the spectra whose ``peaks.mz`` is non-empty, in their original order.

    """
    kept = []
    for candidate in spectra:
        # A spectrum counts as "null" when its m/z array is empty.
        if len(candidate.peaks.mz) > 0:
            kept.append(candidate)
    return kept
    
def ppm_calc(mass,mass_error=5):
    """
    Computes the lower and upper mass bounds allowed around a mass for a given ppm tolerance.

    Requirements:
    mass(float): the reference mass the window is centred on

    Optional:
    mass_error(int): the tolerance in parts per million. Defaults to 5ppm (publication limit)

    Returns:
    (list): [lower bound, upper bound] of the mass window.

    """
    # Absolute tolerance: ppm value scaled by 1e-6 and by the reference mass.
    tolerance = mass_error*0.000001*mass
    lower_bound = mass-tolerance
    upper_bound = mass+tolerance
    return [lower_bound,upper_bound]
    
def Clean_Metadata(Input_Dataframe, GNPS_Table=None):
    """
    Adds the library 'ExactMass' and 'precursor_mz' values to a table of matches.

    Requirements:
    Input_Dataframe(DataFrame): matches whose index holds the library spectrum ids.

    Optional:
    GNPS_Table(DataFrame): library records with 'spectrum_id', 'ExactMass' and
    'Precursor_MZ' columns. Defaults to the notebook-level ``GNPS_Condensed``.

    Returns:
    (DataFrame): a copy of Input_Dataframe with 'ExactMass' and 'precursor_mz'
    columns filled from the library. Ids with no library record get NaN instead
    of raising, so they simply fail any later mass-window filter.

    """
    if GNPS_Table is None:
        # Backward-compatible default: the table loaded into the notebook namespace.
        GNPS_Table = GNPS_Condensed
    # Build the id -> masses lookup once instead of scanning the whole table per row.
    # When an id occurs several times the first record is used.
    lookup = (
        GNPS_Table.drop_duplicates(subset='spectrum_id')
        .set_index('spectrum_id')[['ExactMass', 'Precursor_MZ']]
    )
    New_Dataframe = Input_Dataframe.copy()
    matched = lookup.reindex(New_Dataframe.index)
    # Assign by position so repeated ids in the input index cannot misalign rows.
    New_Dataframe['ExactMass'] = matched['ExactMass'].to_numpy()
    New_Dataframe['precursor_mz'] = matched['Precursor_MZ'].to_numpy()
    return New_Dataframe

def Retrieve_by_pepmass (spectra,mass,round_to=3):
    """
    Finds the positions of spectra whose precursor m/z matches a value or falls in a range.

    Requirements:
    spectra(list): spectrum objects whose ``metadata['pepmass'][0]`` is the precursor m/z
    mass(number or pair): a single number is compared for equality with the pepmass
    rounded to ``round_to`` decimals; a (low, high) tuple or list selects every
    pepmass strictly between the two bounds.

    Optional:
    round_to(int): decimals used when rounding the pepmass for the single-value search.

    Returns:
    (list): indices into ``spectra`` of the matching spectra.

    Raises:
    TypeError: if ``mass`` is neither a real number nor a pair of bounds.

    """
    import numbers  # local import: only needed for the type check below

    if isinstance(mass, numbers.Real):
        # Accepts ints and numpy scalars as well as plain floats.
        return [
            position for position, spectrum in enumerate(spectra)
            if round(spectrum.metadata['pepmass'][0], round_to) == mass
        ]
    if isinstance(mass, (tuple, list)) and len(mass) == 2:
        lower, upper = mass
        return [
            position for position, spectrum in enumerate(spectra)
            if lower < spectrum.metadata['pepmass'][0] < upper
        ]
    # An unsupported argument used to produce an empty list, which is
    # indistinguishable from "no spectrum matched"; fail loudly instead.
    raise TypeError(
        f"mass must be a number or a (low, high) pair, got {type(mass).__name__}"
    )

def Create_Annotation_Results (spectra, spectra_matches,ion_mode='positive',cutoff_score=0.70,mass_error=5,Savefile=None,adduct_mass=1.0078):
    """
    Creates a table of annotations by filtering library matches with a cutoff score
    and a ppm mass error (observed vs records).

    For every spectrum, matches scoring above ``cutoff_score`` are kept and then
    filtered by mass: first the library 'ExactMass' against the observed pepmass
    corrected by the adduct mass, and, if nothing survives, the library
    'precursor_mz' against the uncorrected pepmass. The first surviving match is
    reported.

    Requirements:
    spectra(list): spectrum objects whose ``metadata['pepmass'][0]`` is the observed m/z
    spectra_matches(list): one DataFrame of matches per spectrum, in the same order as
    ``spectra``, with 'score' and 'compound_name' columns and library ids as index.

    Optional:
    ion_mode(str): 'positive' (adduct mass subtracted) or 'negative' (adduct mass added).
    cutoff_score(float): matches must score strictly above this value.
    mass_error(int): mass window in ppm, passed to ppm_calc.
    Savefile(str): if given, the result table is also written to this CSV path.
    adduct_mass(float): mass removed/added to estimate the neutral mass. Defaults to
    1.0078 to reproduce earlier results.
    NOTE(review): the proton mass is ~1.007276; the ~0.0005 Da difference is
    comparable to a 5 ppm window at low masses — confirm which value is intended.

    Returns:
    (DataFrame): columns 'PEPMASS', 'Compound_Name', 'GNPS_Hit', one row per annotated spectrum.

    Raises:
    ValueError: if ``ion_mode`` is not 'positive' or 'negative'.

    """
    # Validate up front: an unknown mode previously left the mass window undefined
    # (NameError) or silently reused the window of an earlier spectrum.
    if ion_mode == 'positive':
        neutral_offset = -adduct_mass
    elif ion_mode == 'negative':
        neutral_offset = adduct_mass
    else:
        raise ValueError(f"ion_mode must be 'positive' or 'negative', got {ion_mode!r}")

    records = []
    Number_of_spectra = len(spectra_matches)
    for i, matches in enumerate(spectra_matches):
        print(f'There are {Number_of_spectra} left to process.')
        Number_of_spectra-=1
        High_matches = matches[matches['score']>cutoff_score]
        if len(High_matches)==0:
            continue
        PEPMASS = spectra[i].metadata['pepmass'][0]
        Process_Result = Clean_Metadata(High_matches)
        # First attempt: match via exact (neutral) mass.
        Mass_low,Mass_high = ppm_calc(PEPMASS+neutral_offset,mass_error=mass_error)
        Refined_Results = Process_Result[Process_Result['ExactMass'].between(Mass_low,Mass_high)]
        if len(Refined_Results)==0:
            # Fallback: match via precursor m/z, using the uncorrected pepmass.
            Mass_low,Mass_high = ppm_calc(PEPMASS,mass_error=mass_error)
            Refined_Results = Process_Result[Process_Result['precursor_mz'].between(Mass_low,Mass_high)]
        if len(Refined_Results)==0:
            continue
        # Report the first surviving row; its index label is the library spectrum id.
        best_hit = Refined_Results.iloc[0]
        records.append({'PEPMASS':PEPMASS,'Compound_Name':best_hit['compound_name'],'GNPS_Hit':best_hit.name})
    # Build the frame once: avoids quadratic concat-in-a-loop and keeps column
    # dtypes independent of how pandas treats concatenation with an empty frame.
    Annotation_Results = pd.DataFrame(records,columns=['PEPMASS','Compound_Name','GNPS_Hit'])
    if Savefile:
        Annotation_Results.to_csv(Savefile)
    return Annotation_Results

# General notes about working with these data: 

* Positive mode data were downloaded from https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=39ed9b19f2e348f1b45bb8c78e11c16d and pertain to MSV000087011. 
* The "Download clustered spectra as MGF file" option was used to obtain MSV000087011.mgf.
* This MGF contained null spectra (spectra with no peaks), so drop_null_spectra was written and applied, leaving 787 nodes (verified against the manuscript).

* The same procedure was repeated for the negative mode data, MSV000087012 (https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=692236f015914501a9b0f2286c1df47f)

# Data filtration: 

The MGF file exported from the molecular network is filtered to drop null spectra and saved under a new name. **Do not run** unless you need it explicitly. 

In [60]:

# Input MGF exported from the GNPS molecular network. The commented-out line is the
# alternative input (MSV000087011); the active line points at the 692236f0 export.
# f = os.path.join("MSV000087011","MSV000087011.mgf")
f= "METABOLOMICS-SNETS-V2-692236f0-download_clustered_spectra-main.mgf" # Change to destination of file as needed. 

# Load every spectrum into memory, then keep only those with at least one peak.
spectra = list(matchms.importing.load_from_mgf(f))
spectrum = spectra[0]  # NOTE(review): not used again in this cell; raises IndexError if the MGF yields no spectra.
spectra = drop_null_spectra(spectra)
# The output name is hard-coded; change it together with `f` when switching datasets.
# NOTE(review): confirm whether save_as_mgf appends to or overwrites an existing file before re-running.
matchms.exporting.save_as_mgf(spectra,'MSV000087012_Filtered.mgf')

## Creating the condensed GNPS.json 

This is done because the condensed json file is less taxing on memory than the whole thing and we only need a few columns. **Again, do not run unless explicitly needed**

In [None]:

# Load the full GNPS library dump and keep only the columns needed for annotation.
GNPS_Structure_DF = pd.read_json('ALL_GNPS.json')
GNPS_Structure_DF.columns  # not the last expression of the cell, so nothing is displayed
selected_columns = GNPS_Structure_DF[['spectrum_id','Precursor_MZ','ExactMass','Compound_Name','Smiles','INCHI']]
# Copy so the condensed table is an independent frame rather than a selection of the full one.
GNPS_Condensed = selected_columns.copy()
# Written to disk here and read back by the annotation cell.
GNPS_Condensed.to_json('GNPS_Condensed.json')

# Data Processing

In [3]:
# Filtered positive mode spectra (null spectra already removed).
path_to_mgf='MSV000087011_Filtered.mgf'
# Materialise the loader output as a list so spectra can be indexed by position later.
spectra = list(matchms.importing.load_from_mgf(path_to_mgf))
# Sanity check shown as the cell output.
len(spectra) # should be 787 for the positive mode data. 

787

In [6]:
# Ask the spec2vec object for library matches for the spectra in the MGF file.
# Downstream code indexes the result by position (one table of matches per spectrum)
# and reads its 'score' and 'compound_name' columns.
# NOTE(review): n_best presumably caps the number of matches returned per spectrum — confirm against the omigami docs.
spectra_matches = spec2vec.match_spectra_from_path(
    path_to_mgf, 
    n_best=100, 
    ion_mode="positive",
    include_metadata=["smiles", "compound_name",'parent_mass']
)
# Change to ms2deepscore when needed. 

[                            score                  compound_name parent_mass  \
 matches of spectrum-0                                                          
 CCMSLIB00005759013       0.425402              3-Aminoisobutyrat     103.064   
 CCMSLIB00000223211       0.425402      alpha-Methyl-beta-alanine     103.064   
 CCMSLIB00000221511       0.425361  3-amino-2-methylpropanoic aci     103.064   
 CCMSLIB00000577917        0.28621            3-AMINOISOBUTANOATE     103.063   
 CCMSLIB00005720608       0.241379               4-AMINOBUTANOATE     103.064   
 ...                           ...                            ...         ...   
 CCMSLIB00006552053     0.00998506                   Malonic acid     104.013   
 CCMSLIB00006363486      0.0094881                   Malonic acid     104.013   
 CCMSLIB00005463935     0.00930952             2-AMINOISOBUTYRATE     103.064   
 CCMSLIB00005755952     0.00928514         2-Aminoisobutyric acid     103.073   
 CCMSLIB00000214951     0.00

In [9]:
# GNPS_Condensed must exist as a notebook-level name before Create_Annotation_Results
# runs: Clean_Metadata reads it as a global.
GNPS_Condensed = pd.read_json('GNPS_Condensed.json')
# Positive mode annotation with a 20 ppm window (the function default is 5 ppm).
Annotation_Results = Create_Annotation_Results(spectra, spectra_matches,ion_mode='positive',cutoff_score=0.7,mass_error=20)
Manual_Annotations = pd.read_csv('Manual_Annotations.csv') #positive (extra column due to manual eval of data. )
# Rename the columns positionally; 'PEPMASS' is the column shared with Annotation_Results.
Manual_Annotations.columns = ['PEPMASS','MN_Annotation',"Manual_ID",'Truth']

In [16]:
# Join automatic and manual annotations on the precursor m/z rounded to 2 decimals.
# The key is named explicitly: without `on`, pandas joins on every column name the two
# frames share, so an extra shared column would silently change the join.
Comparison_Table = pd.merge(
    Annotation_Results.round(2),
    Manual_Annotations.round(2),
    how='inner',
    on='PEPMASS',
)
# Comparison_Table.to_csv('S2V_20_ppm_vs_Manual_positive.csv') #option to save. 

## Streamlined negative mode implementation

In [None]:
# Negative mode: the same pipeline as the positive mode cells, run end to end.
# NOTE: this rebinds spectra, spectra_matches, Annotation_Results, Manual_Annotations and
# Comparison_Table, so the positive mode results are no longer in the namespace afterwards.
# GNPS_Condensed must already be loaded (Clean_Metadata reads it as a global).
path_to_mgf = "MSV000087012_Filtered.mgf"  # a single-argument os.path.join returned this same string
spectra = list(matchms.importing.load_from_mgf(path_to_mgf))
spectra_matches = spec2vec.match_spectra_from_path(
    path_to_mgf,
    n_best=100,
    ion_mode="negative",
    include_metadata=["smiles", "compound_name",'parent_mass']
)
# 10 ppm window here (the positive mode cell uses 20 ppm).
Annotation_Results = Create_Annotation_Results(spectra, spectra_matches,ion_mode='negative',cutoff_score=0.7,mass_error=10)
Manual_Annotations = pd.read_csv('Manual_Annotations_Negative_mode.csv')
Manual_Annotations.columns = ['PEPMASS','MN_Annotation',"Manual_ID"]
# Explicit join key: precursor m/z rounded to 2 decimals.
Comparison_Table = pd.merge(
    Annotation_Results.round(2),
    Manual_Annotations.round(2),
    how='inner',
    on='PEPMASS',
)