# Derive spec2vec embeddings of MS/MS spectra

### Imports

In [1]:
import os
import sys
import gensim
import numpy as np

# Repository root, taken as the parent of the current working directory.
ROOT = os.path.dirname(os.getcwd())
# Folder holding the input data. The active value is a machine-specific absolute
# Windows path; the commented-out line is the repository-relative alternative.
#path_data = os.path.join(ROOT, 'data')
path_data = 'C:\\OneDrive - Netherlands eScience Center\\Project_Wageningen_iOMEGA\\matchms\\data\\'
# Put ROOT at the front of the module search path.
sys.path.insert(0, ROOT)

### Import the dataset to create embeddings from (here: the pre-processed "Unique InchiKeys" dataset)

In [2]:
from matchms.importing import load_from_json

# Full path of the JSON file with the pre-processed spectra inside path_data.
filename = os.path.join(path_data,'gnps_positive_ionmode_unique_inchikey_cleaned_by_matchms_and_lookups.json')
# Load the spectra from that JSON file with matchms.
spectrums = load_from_json(filename)

# Report how many spectra were loaded.
print("number of spectra:", len(spectrums))

number of spectra: 13717


### Post-processing of data

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import add_losses

In [4]:
def post_process(s, n_required=10):
    """Post-process one spectrum for spec2vec, or return None if it is rejected.

    The matchms filters are applied in this order:

    1. ``normalize_intensities``
    2. ``select_by_mz`` with ``mz_from=0, mz_to=1000``
    3. ``require_minimum_number_of_peaks`` with ``n_required``
    4. ``reduce_to_number_of_peaks`` with ``n_required`` and
       ``ratio_desired=0.5``
    5. ``select_by_relative_intensity`` with ``intensity_from=0.001``; its
       result is only kept when at least ``n_required`` peaks remain
    6. ``add_losses`` with ``loss_mz_from=5.0, loss_mz_to=200.0``

    Args:
        s: Spectrum to process (presumably a matchms ``Spectrum`` or None;
            confirm against the matchms version in use).
        n_required: Minimum number of peaks a spectrum must have. It is used
            consistently in steps 3, 4 and 5. Defaults to 10.

    Returns:
        The processed spectrum, or None when ``s`` is None after steps 1-4.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=n_required)
    s = reduce_to_number_of_peaks(s, n_required=n_required, ratio_desired=0.5)
    if s is None:
        return None

    # Removing low-intensity peaks must not push the spectrum below the
    # minimum peak count; if it would, keep the unfiltered peaks instead.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= n_required:
        s = s_remove_low_peaks

    return add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)

# Post-process every spectrum and keep only those that qualified for analysis
# (post_process returns None for a spectrum that is rejected).
spectrums_postprocessed = [
    processed
    for processed in map(post_process, spectrums)
    if processed is not None
]

### Load pretrained spec2vec model
- See for instance: https://doi.org/10.5281/zenodo.4173596 (model pretrained on AllPositive dataset)

In [5]:
# Location of the pretrained model file: the "trained_models" sub-folder of path_data.
path_models = os.path.join(path_data, "trained_models")
model_file = os.path.join(path_models, "spec2vec_AllPositive_ratio05_filtered_201101_iter_15.model")

# Load pretrained model
# NOTE(review): gensim model files are pickle-based as far as I know, so only
# load model files obtained from a trusted source -- confirm.
model = gensim.models.Word2Vec.load(model_file)

### Create spectrum "documents"

In [6]:
from spec2vec import Spec2Vec
from spec2vec import SpectrumDocument 

In [7]:
# Wrap every post-processed spectrum in a SpectrumDocument (n_decimals=2).
documents = [SpectrumDocument(s, n_decimals=2) for s in spectrums_postprocessed]

In [8]:
# Show the first 20 words of the first document.
documents[0].words[:20]

['peak@289.29',
 'peak@295.55',
 'peak@298.49',
 'peak@317.32',
 'peak@319.66',
 'peak@324.48',
 'peak@325.32',
 'peak@339.79',
 'peak@343.95',
 'peak@347.02',
 'peak@347.91',
 'peak@361.15',
 'peak@361.84',
 'peak@364.23',
 'peak@364.86',
 'peak@365.85',
 'peak@368.22',
 'peak@368.97',
 'peak@375.07',
 'peak@375.75']

### Derive embeddings

In [9]:
from tqdm.notebook import tqdm  # optional, just to get a progress bar
from spec2vec.vector_operations import calc_vector

In [10]:
# Settings forwarded to calc_vector for every document.
intensity_weighting_power = 0.5
# Maximum (weighted) percentage of a spectrum that is allowed to be missing.
allowed_missing_percentage = 10

vector_size = model.vector_size
print(f"Embedding vector size: {vector_size}")

# One row per document, one column per embedding dimension.
embeddings_spec2vec = np.zeros((len(documents), vector_size), dtype="float")
for i, doc in enumerate(tqdm(documents)):
    # Each computed vector fills the complete row i.
    embeddings_spec2vec[i, :] = calc_vector(
        model,
        doc,
        intensity_weighting_power=intensity_weighting_power,
        allowed_missing_percentage=allowed_missing_percentage,
    )

Embedding vector size: 300


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12797.0), HTML(value='')))




In [12]:
# Show the array shape: (number of documents, embedding vector size).
embeddings_spec2vec.shape

(12797, 300)

In [20]:
# Print the first embedding vector with every value rounded to 4 decimals.
print([np.round(x, 4) for x in embeddings_spec2vec[0,:]])

[42.0178, -43.2054, 34.7513, -107.5347, -3.2445, -76.6327, -11.3103, -36.3136, 4.8236, 41.0216, 39.7589, -3.3456, 25.8479, -39.3929, -16.6266, -38.0523, 20.2984, 37.2182, 16.1859, 42.593, 33.4526, -22.1193, 72.8967, 13.4784, -9.9983, 28.5276, -21.382, -4.9061, -1.5627, 16.9605, -54.0134, -28.2718, 9.271, 33.7729, 32.5119, 1.4593, 3.954, 33.8745, -0.9841, -10.5822, 31.8189, -17.6984, 44.6887, -39.6979, 4.4911, -27.5185, -15.1705, 36.0776, 17.4914, 47.657, -37.9565, -2.4548, 0.2419, 41.5399, -51.2658, 19.3386, -44.8592, 7.5528, -20.032, -12.4599, -6.3517, -3.3403, -29.8746, 0.0414, -16.2784, 9.1359, 14.9801, -6.6536, 74.3326, -24.2418, 6.308, 26.0182, -27.0743, -6.403, 30.0604, -2.4306, -25.09, 58.094, 11.1743, 18.9769, -45.2443, 49.2554, 8.8223, -8.9952, -30.1558, 10.2108, -43.2419, -24.7698, 6.6931, 48.0061, 16.3499, 64.5272, 35.6992, 61.1264, 16.8335, 11.5313, 76.2697, 10.0867, 39.2198, -19.8674, -9.7124, -8.2465, -15.4243, -5.7536, -18.4063, -26.6288, 8.6747, -15.5598, 31.7884, 11.60