# Generate predictions
The predictions are generated and compared to the correct results. 

This is done on 2 test sets:
- 3000 spectra with an exact match
- spectra of 250 inchikeys without an exact match

This is done using 3 different methods:
- MS2Query
- MS2Deepscore
- Cosine score

The output stores the scores for the highest library hit for each test spectrum. The results are stored in the following format:
[(highest_scoring_spectrum_id, predicted_score)]
The highest scoring spectrum id is used to later identify the best hit. The predicted score is the MS2Query score, the MS2Deepscore score or the (modified) cosine score.

In [1]:
import os
from ms2query.utils import load_pickled_file
# Root folder of the MS2Query data files (absolute path on the author's machine).
path_data = "C:/Users/jonge094/PycharmProjects/PhD_MS2Query/ms2query/data"

# Test set with 3000 spectra.
_test_3000_pickle = os.path.join(
    path_data,
    "libraries_and_models/gnps_15_12_2021/in_between_files/GNPS_15_12_2021_pos_test_3000_spectra.pickle")
test_spectra_3000 = load_pickled_file(_test_3000_pickle)

# Test set with the spectra of 250 inchikeys.
_test_250_pickle = os.path.join(
    path_data,
    "libraries_and_models/gnps_15_12_2021/in_between_files/GNPS_15_12_2021_pos_test_250_inchikeys.pickle")
test_spectra_250 = load_pickled_file(_test_250_pickle)


# MS2Query
The code below was run on the server and the results are loaded.

In [2]:
import os
from ms2query.run_ms2query import run_complete_folder
from ms2query.ms2library import MS2Library

# All paths are resolved relative to the parent of the current working directory.
path_root = os.path.dirname(os.getcwd())
path_library = os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/library_gnps_15_12/")
ms2_spectra_directory = os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/test_spectra/")

# todo rename msds file name for end file.
# Collect every file the MS2Library needs, keyed by the constructor argument name.
# NOTE(review): the classifier file uses "../data" (one level less than the other
# paths) and the gnps_09_04_2021 folder - confirm this is intentional.
library_file_names = {
    "sqlite_file_name": os.path.join(path_library, "library_GNPS_15_12_2021.sqlite"),
    "s2v_model_file_name": os.path.join(path_library, "spec2vec_model_GNPS_15_12_2021.model"),
    "ms2ds_model_file_name": os.path.join(path_library, "ms2ds_model_GNPS_15_12_2021.hdf5"),
    "pickled_s2v_embeddings_file_name": os.path.join(path_library, "library_GNPS_15_12_2021_s2v_embeddings.pickle"),
    "pickled_ms2ds_embeddings_file_name": os.path.join(path_library, "library_GNPS_15_12_2021_ms2ds_embeddings.pickle"),
    "ms2query_model_file_name": os.path.join(path_library, "ms2query_random_forest_model.pickle"),
    "classifier_csv_file_name": os.path.join(path_root, "../data/libraries_and_models/gnps_09_04_2021/ALL_GNPS_210409_positive_processed_annotated_CF_NPC_classes.txt"),
}

# Create a MS2Library object
ms2library = MS2Library(**library_file_names)

# Run library search and analog search on your files.
run_complete_folder(ms2library, ms2_spectra_directory)


# MS2Deepscore

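For each test spectrum, the library spectrum with the highest MS2Deepscore within the allowed precursor m/z range is selected.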

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
from ms2deepscore.models import load_model as load_ms2ds_model
from ms2deepscore import MS2DeepScore
from spec2vec.vector_operations import calc_vector, cosine_similarity_matrix
from ms2query.query_from_sqlite_database import get_precursor_mz_within_range

def get_all_ms2ds_scores(ms2ds_model_file_name, ms2ds_embeddings_file_name, query_spectra
                          ) -> pd.DataFrame:
    """Returns a dataframe with the ms2deepscore similarity scores

    The similarity scores are calculated between the query_spectra and all
    library spectra. The rows of the returned dataframe carry the index of the
    library embeddings; the columns are the positions (0, 1, ...) of the
    spectra in query_spectra.

    ms2ds_model_file_name
        File name of the MS2Deepscore model used to embed the query spectra.
    ms2ds_embeddings_file_name
        File name of the pickled library embeddings. The unpickled object must
        offer `.to_numpy()` and `.index` (e.g. a pandas DataFrame).
    query_spectra
        Spectra for which similarity scores should be calculated for all
        spectra in the ms2ds embeddings file.
    """
    # Use the function argument; the previous version silently read the
    # notebook global `pickled_ms2ds_embeddings_file_name` instead.
    ms2ds_embeddings = load_pickled_file(ms2ds_embeddings_file_name)
    ms2ds_model = load_ms2ds_model(ms2ds_model_file_name)
    ms2ds = MS2DeepScore(ms2ds_model, progress_bar=False)
    query_embeddings = ms2ds.calculate_vectors(query_spectra)
    library_ms2ds_embeddings_numpy = ms2ds_embeddings.to_numpy()

    # Library embeddings go first so that the rows line up with the library index.
    ms2ds_scores = cosine_similarity_matrix(library_ms2ds_embeddings_numpy,
                                            query_embeddings)
    similarity_matrix_dataframe = pd.DataFrame(
        ms2ds_scores,
        index=ms2ds_embeddings.index)
    return similarity_matrix_dataframe

def select_highest_ms2ds_in_mass_range(ms2deepscores, test_spectra, sqlite_file_location, allowed_mass_diff):
    """Returns per test spectrum the library spectrum id with the highest MS2Deepscore

    Only library spectra with a precursor m/z within allowed_mass_diff of the
    test spectrum are considered. If no library spectrum is within that range,
    None is stored for that test spectrum.

    ms2deepscores
        Dataframe with library spectrum ids as index and one column per test
        spectrum, where column i belongs to test_spectra[i].
    test_spectra
        Spectra offering `.get("precursor_mz")`, in the same order as the
        columns of ms2deepscores.
    sqlite_file_location
        Sqlite file that is queried for library spectra in the mass range.
    allowed_mass_diff
        Maximum allowed difference in precursor m/z.
    """
    highest_ms2_deepscore_in_mass_range = []
    # Wrap the spectra instead of the enumerate object, so tqdm knows the
    # total and can show progress instead of a bare iteration counter.
    for i, spectrum in enumerate(tqdm(test_spectra)):
        precursor_mz_query_spectrum = spectrum.get("precursor_mz")
        spectra_and_mass = get_precursor_mz_within_range(
            sqlite_file_location,
            precursor_mz_query_spectrum - allowed_mass_diff,
            precursor_mz_query_spectrum + allowed_mass_diff)
        # The first element of each result is used as the library spectrum id.
        spectra = [spectrum_and_mass[0] for spectrum_and_mass in spectra_and_mass]
        if spectra:
            highest_ms2_deepscore_in_mass_range.append(ms2deepscores[i].loc[spectra].idxmax())
        else:
            # idxmax raises on an empty selection, so store "no hit" explicitly.
            highest_ms2_deepscore_in_mass_range.append(None)
    return highest_ms2_deepscore_in_mass_range

In [4]:
# Root folder of the MS2Query data files (absolute path on the author's machine).
path_data = "C:/Users/jonge094/PycharmProjects/PhD_MS2Query/ms2query/data"
# Folder containing the library files.
path_library = os.path.join(path_data, "libraries_and_models/gnps_15_12_2021/library_gnps_15_12/")

# Model and library embeddings that are passed to get_all_ms2ds_scores.
ms2ds_model_file_name = os.path.join(path_library, "ms2ds_model_GNPS_15_12_2021.hdf5")
pickled_ms2ds_embeddings_file_name = os.path.join(path_library, "library_GNPS_15_12_2021_ms2ds_embeddings.pickle")

### Run for test spectra 250

A mass difference of 100 Da is used and the highest MS2Deepscore is selected

In [None]:
# MS2Deepscores between every library spectrum and the 250 inchikeys test spectra.
ms2deepscores = get_all_ms2ds_scores(
    ms2ds_model_file_name,
    pickled_ms2ds_embeddings_file_name,
    query_spectra=test_spectra_250,
)

In [10]:
def select_highest_ms2ds_in_mass_range(ms2deepscores, test_spectra, sqlite_file_location, allowed_mass_diff):
    """Returns per test spectrum the library spectrum id with the highest MS2Deepscore

    Only library spectra with a precursor m/z within allowed_mass_diff of the
    test spectrum are considered. If no library spectrum is within that range,
    None is stored for that test spectrum.

    ms2deepscores
        Dataframe with library spectrum ids as index and one column per test
        spectrum, where column i belongs to test_spectra[i].
    test_spectra
        Spectra offering `.get("precursor_mz")`, in the same order as the
        columns of ms2deepscores.
    sqlite_file_location
        Sqlite file that is queried for library spectra in the mass range.
    allowed_mass_diff
        Maximum allowed difference in precursor m/z.
    """
    highest_ms2_deepscore_in_mass_range = []
    # Wrap the spectra instead of the enumerate object, so tqdm knows the
    # total and can show progress instead of a bare iteration counter.
    for i, spectrum in enumerate(tqdm(test_spectra)):
        precursor_mz_query_spectrum = spectrum.get("precursor_mz")
        spectra_and_mass = get_precursor_mz_within_range(
            sqlite_file_location,
            precursor_mz_query_spectrum - allowed_mass_diff,
            precursor_mz_query_spectrum + allowed_mass_diff)
        # The first element of each result is used as the library spectrum id.
        spectra = [spectrum_and_mass[0] for spectrum_and_mass in spectra_and_mass]
        if spectra:
            highest_ms2_deepscore_in_mass_range.append(ms2deepscores[i].loc[spectra].idxmax())
        else:
            # idxmax raises on an empty selection, so store "no hit" explicitly.
            highest_ms2_deepscore_in_mass_range.append(None)
    return highest_ms2_deepscore_in_mass_range

In [None]:
# Best library hit per test spectrum, restricted to library spectra within 100 Da.
highest_ms2deepscore_within_100_Da = select_highest_ms2ds_in_mass_range(
    ms2deepscores,
    test_spectra_250,
    sqlite_file_location=os.path.join(path_library, "library_GNPS_15_12_2021.sqlite"),
    allowed_mass_diff=100,
)

The MS2Deepscore is also stored, so it can potentially be used as a threshold at a later stage.

In [31]:
# Pair every selected library spectrum id with its MS2Deepscore, giving
# [(highest_scoring_spectrum_id, predicted_score)] in test spectrum order.
best_spectrum_and_ms2deepscore = []
for i, spectrum_id in enumerate(highest_ms2deepscore_within_100_Da):
    if spectrum_id is None:
        # No library hit for this test spectrum; `.loc[None]` would raise.
        best_spectrum_and_ms2deepscore.append((None, None))
        continue
    # Column i of ms2deepscores holds the scores for test spectrum i.
    selected_ms2deepscore = ms2deepscores[i].loc[spectrum_id]
    best_spectrum_and_ms2deepscore.append((spectrum_id, selected_ms2deepscore))

In [33]:
import pickle
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump is finished (a bare open() leaves it to the garbage collector).
with open(os.path.join(path_data, "libraries_and_models/gnps_15_12_2021/benchmarking/ms2deepscores_within_100_Da_test_spectra_250.pickle"), "wb") as output_file:
    pickle.dump(best_spectrum_and_ms2deepscore, output_file)

### Run for test spectra 3000
All library spectra within a precursor mass difference of 0.25 Da are considered, and the one with the highest MS2Deepscore is selected.

In [5]:
# MS2Deepscores between every library spectrum and the 3000 test spectra.
ms2deepscores = get_all_ms2ds_scores(
    ms2ds_model_file_name,
    pickled_ms2ds_embeddings_file_name,
    query_spectra=test_spectra_3000,
)

In [16]:
def select_highest_ms2ds_in_mass_range(ms2deepscores, test_spectra, sqlite_file_location, allowed_mass_diff):
    """Returns per test spectrum the library spectrum id with the highest MS2Deepscore

    Only library spectra with a precursor m/z within allowed_mass_diff of the
    test spectrum are considered. If no library spectrum is within that range,
    None is stored for that test spectrum.

    ms2deepscores
        Dataframe with library spectrum ids as index and one column per test
        spectrum, where column i belongs to test_spectra[i].
    test_spectra
        Spectra offering `.get("precursor_mz")`, in the same order as the
        columns of ms2deepscores.
    sqlite_file_location
        Sqlite file that is queried for library spectra in the mass range.
    allowed_mass_diff
        Maximum allowed difference in precursor m/z.
    """
    highest_ms2_deepscore_in_mass_range = []
    # Wrap the spectra instead of the enumerate object, so tqdm knows the
    # total and can show progress instead of a bare iteration counter.
    for i, spectrum in enumerate(tqdm(test_spectra)):
        precursor_mz_query_spectrum = spectrum.get("precursor_mz")
        spectra_and_mass = get_precursor_mz_within_range(
            sqlite_file_location,
            precursor_mz_query_spectrum - allowed_mass_diff,
            precursor_mz_query_spectrum + allowed_mass_diff)
        # The first element of each result is used as the library spectrum id.
        spectra = [spectrum_and_mass[0] for spectrum_and_mass in spectra_and_mass]
        if spectra:
            highest_ms2_deepscore_in_mass_range.append(ms2deepscores[i].loc[spectra].idxmax())
        else:
            # No library spectrum within the mass range; store "no hit".
            highest_ms2_deepscore_in_mass_range.append(None)
    return highest_ms2_deepscore_in_mass_range

In [19]:
# Best library hit per test spectrum, restricted to library spectra within 0.25 Da.
highest_ms2deepscore_within_0_25_Da = select_highest_ms2ds_in_mass_range(
    ms2deepscores,
    test_spectra_3000,
    sqlite_file_location=os.path.join(path_library, "library_GNPS_15_12_2021.sqlite"),
    allowed_mass_diff=0.25,
)

0it [00:00, ?it/s]

In [21]:
# Pair every selected library spectrum id with its MS2Deepscore; test spectra
# without a hit in the mass range are stored as (None, None).
# Column i of ms2deepscores holds the scores for test spectrum i.
best_spectrum_and_ms2deepscore = [
    (None, None) if spectrum_id is None else (spectrum_id, ms2deepscores[i].loc[spectrum_id])
    for i, spectrum_id in enumerate(highest_ms2deepscore_within_0_25_Da)
]

In [22]:
# Notebook display of the (spectrum_id, ms2deepscore) tuples for visual inspection.
best_spectrum_and_ms2deepscore

[('CCMSLIB00000006313', 0.7583803411877641),
 ('CCMSLIB00005463475', 0.681292296999443),
 ('CCMSLIB00005724297', 0.6189994721423024),
 ('CCMSLIB00000853340', 0.7828237758851809),
 (None, None),
 ('CCMSLIB00000005095', 0.8550851115337204),
 ('CCMSLIB00000075013', 0.562564343895491),
 ('CCMSLIB00000007098', 0.9999999999999342),
 ('CCMSLIB00000068232', 0.8481030205580182),
 ('CCMSLIB00004686427', 0.6639036226056667),
 ('CCMSLIB00006395642', 0.790810761862822),
 ('CCMSLIB00000848685', 0.9110227891046291),
 ('CCMSLIB00006686346', 0.8445271835154597),
 ('CCMSLIB00006533998', 0.834128464222159),
 ('CCMSLIB00006481807', 0.907370364755867),
 ('CCMSLIB00006469418', 0.7162316287798675),
 ('CCMSLIB00006547816', 0.8406311640477768),
 ('CCMSLIB00000075068', 0.9493142109505874),
 ('CCMSLIB00000075336', 0.9552921544676285),
 ('CCMSLIB00000075315', 0.7132792874923419),
 ('CCMSLIB00000077256', 0.824762951232503),
 ('CCMSLIB00006515930', 0.7278928482311822),
 ('CCMSLIB00000574475', 0.9757081570446855),
 

In [23]:
import pickle
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump is finished (a bare open() leaves it to the garbage collector).
with open(os.path.join(path_data, "libraries_and_models/gnps_15_12_2021/benchmarking/ms2deepscores_within_0_25_Da_test_spectra_3000.pickle"), "wb") as output_file:
    pickle.dump(best_spectrum_and_ms2deepscore, output_file)

# Modified Cosine

The following code was run on the server, to generate the analogue search results.

In [None]:
import os
import pickle
from matchms.calculate_scores import calculate_scores
from matchms.similarity.ModifiedCosine import ModifiedCosine
from tqdm import tqdm
from ms2query.query_from_sqlite_database import get_spectra_from_sqlite
from ms2query.utils import load_pickled_file

def select_spectra_within_mass_range(spectra, lower_bound, upper_bound):
    """Returns the spectra with a precursor m/z between the two bounds

    Both bounds are inclusive and the order of the spectra is kept.

    spectra
        Iterable of objects offering `.get('precursor_mz')`.
    lower_bound
        Lowest precursor m/z that is still selected.
    upper_bound
        Highest precursor m/z that is still selected.
    """
    return [candidate for candidate in spectra
            if upper_bound >= candidate.get('precursor_mz') >= lower_bound]

def analogue_search(lib_spectra, test_spectra, mass_tolerance = 100):
    """Returns the best modified cosine library hit for each test spectrum

    For every test spectrum, the library spectra with a precursor m/z within
    mass_tolerance are scored with ModifiedCosine and the highest scoring one
    is kept.

    lib_spectra
        Library spectra to search in.
    test_spectra
        Spectra for which the best library hit is searched.
    mass_tolerance
        Maximum allowed difference in precursor m/z between test and library
        spectrum.

    Returns a list with one (highest_scoring_spectrum_id, highest_cosine_score)
    tuple per test spectrum; (None, None) if no library spectrum is within the
    mass tolerance.
    """
    best_matches_for_test_spectra = []
    for test_spectrum in tqdm(test_spectra):
        precursor_mz = test_spectrum.get("precursor_mz")
        selected_lib_spectra = select_spectra_within_mass_range(
            lib_spectra, precursor_mz - mass_tolerance, precursor_mz + mass_tolerance)
        if not selected_lib_spectra:
            # Nothing to score; max() on an empty list would raise a ValueError.
            best_matches_for_test_spectra.append((None, None))
            continue
        scores_list = calculate_scores(selected_lib_spectra, [test_spectrum], ModifiedCosine()).scores_by_query(test_spectrum)
        # Each entry is indexed as (library_spectrum, score), where score[0] is the cosine score.
        # max() keeps the first entry in case of ties.
        best_scores_tuple = max(scores_list, key=lambda scores_tuple: scores_tuple[1][0])
        highest_scoring_spectrum_id = best_scores_tuple[0].get("spectrumid")
        highest_cosine_score = best_scores_tuple[1][0]
        best_matches_for_test_spectra.append((highest_scoring_spectrum_id, highest_cosine_score))
    return best_matches_for_test_spectra

# All paths are resolved relative to the parent of the current working directory.
path_root = os.path.dirname(os.getcwd())
path_library = os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/library_gnps_15_12/")

test_spectra_250 = load_pickled_file(os.path.join(path_root,
                                                   "../../data/libraries_and_models/gnps_15_12_2021/in_between_files/GNPS_15_12_2021_pos_test_250_inchikeys.pickle"))
# Load all library spectra in memory
all_lib_spectra = get_spectra_from_sqlite(os.path.join(path_library, "library_GNPS_15_12_2021.sqlite"), [], get_all_spectra=True)
results = analogue_search(all_lib_spectra, test_spectra_250, mass_tolerance=100)

# Write through a context manager so the file handle is flushed and closed
# as soon as the dump is finished.
with open(os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/benchmarking/highest_mod_cosine_mass_tol_100_test_spectra_250.pickle"), "wb") as output_file:
    pickle.dump(results, output_file)

The following code generates the exact library matching results.

In [97]:
import os
import pickle
from matchms.calculate_scores import calculate_scores
from matchms.similarity.CosineGreedy import CosineGreedy
from tqdm import tqdm
from ms2query.query_from_sqlite_database import get_spectra_from_sqlite
from ms2query.utils import load_pickled_file

def select_spectra_within_mass_range(spectra, lower_bound, upper_bound):
    """Returns the spectra with a precursor m/z between the two bounds

    Both bounds are inclusive and the order of the spectra is kept.

    spectra
        Iterable of objects offering `.get('precursor_mz')`.
    lower_bound
        Lowest precursor m/z that is still selected.
    upper_bound
        Highest precursor m/z that is still selected.
    """
    def is_in_range(candidate):
        mass = candidate.get('precursor_mz')
        return mass <= upper_bound and mass >= lower_bound

    return list(filter(is_in_range, spectra))

def analogue_search(lib_spectra, test_spectra, mass_tolerance, fragment_mass_tolerance, minimum_matched_peaks):
    """Returns the best cosine library hit for each test spectrum

    For every test spectrum, the library spectra with a precursor m/z within
    mass_tolerance are scored with CosineGreedy. Hits with fewer matched peaks
    than minimum_matched_peaks are discarded and the highest scoring remaining
    hit is kept.

    lib_spectra
        Library spectra to search in.
    test_spectra
        Spectra for which the best library hit is searched.
    mass_tolerance
        Maximum allowed difference in precursor m/z between test and library
        spectrum.
    fragment_mass_tolerance
        Passed to CosineGreedy as `tolerance`.
    minimum_matched_peaks
        Minimum number of matched peaks a hit needs to be considered.

    Returns a list with one (highest_scoring_spectrum_id, highest_cosine_score)
    tuple per test spectrum; (None, None) if no library spectrum is within the
    mass tolerance or no hit has enough matched peaks.
    """
    best_matches_for_test_spectra = []
    for test_spectrum in tqdm(test_spectra):
        precursor_mz = test_spectrum.get("precursor_mz")
        selected_lib_spectra = select_spectra_within_mass_range(
            lib_spectra, precursor_mz - mass_tolerance, precursor_mz + mass_tolerance)
        if not selected_lib_spectra:
            best_matches_for_test_spectra.append((None, None))
            continue

        scores_list = calculate_scores(selected_lib_spectra, [test_spectrum], CosineGreedy(tolerance=fragment_mass_tolerance)).scores_by_query(test_spectrum)
        # Keep spectrum and score together while filtering. The previous version
        # filtered only the scores and then used the position in the filtered
        # list to index the unfiltered scores_list, which returned the wrong
        # spectrum whenever an earlier hit had been filtered out.
        candidates = []
        for scores_tuple in scores_list:
            # .item() gives (cosine_score, number_of_matched_peaks).
            score_and_matches = scores_tuple[1].item()
            if score_and_matches[1] >= minimum_matched_peaks:
                candidates.append((scores_tuple[0], score_and_matches[0]))

        if not candidates:
            best_matches_for_test_spectra.append((None, None))
            continue

        # max() keeps the first candidate in case of ties.
        highest_scoring_spectrum, highest_cosine_score = max(candidates, key=lambda candidate: candidate[1])
        highest_scoring_spectrum_id = highest_scoring_spectrum.get("spectrumid")
        best_matches_for_test_spectra.append((highest_scoring_spectrum_id, highest_cosine_score))
    return best_matches_for_test_spectra


In [95]:
# All paths are resolved relative to the parent of the current working directory.
path_root = os.path.dirname(os.getcwd())
path_library = os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/library_gnps_15_12/")

# Test set with 3000 spectra.
_test_3000_pickle = os.path.join(
    path_root,
    "../../data/libraries_and_models/gnps_15_12_2021/in_between_files/GNPS_15_12_2021_pos_test_3000_spectra.pickle")
test_spectra_3000 = load_pickled_file(_test_3000_pickle)

In [None]:
# Load all library spectra in memory; the empty list of spectrum ids is
# passed together with get_all_spectra=True.
_library_sqlite_file = os.path.join(path_library, "library_GNPS_15_12_2021.sqlite")
all_lib_spectra = get_spectra_from_sqlite(_library_sqlite_file, [], get_all_spectra=True)

In [98]:
# Library search that only keeps hits with at least 3 matched peaks.
results = analogue_search(
    all_lib_spectra,
    test_spectra_3000,
    mass_tolerance=0.25,
    fragment_mass_tolerance=0.05,
    minimum_matched_peaks=3,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [16:01<00:00,  3.12it/s]


In [99]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump is finished (a bare open() leaves it to the garbage collector).
with open(os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/benchmarking/highest_cosine_mass_tol_025_fragment_005_min_matched_3_test_spectra_3000.pickle"), "wb") as output_file:
    pickle.dump(results, output_file)

In [100]:
# Same library search, but without a minimum number of matched peaks.
results = analogue_search(all_lib_spectra, test_spectra_3000, mass_tolerance=0.25, fragment_mass_tolerance=0.05, minimum_matched_peaks=0)
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump is finished.
with open(os.path.join(path_root, "../../data/libraries_and_models/gnps_15_12_2021/benchmarking/highest_cosine_mass_tol_025_fragment_005_min_matched_0_test_spectra_3000.pickle"), "wb") as output_file:
    pickle.dump(results, output_file)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [16:08<00:00,  3.10it/s]
