In [None]:
%load_ext autoreload
%autoreload 2
from specxploreImporting import importing
import matchms
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
from ms2query.run_ms2query import download_zenodo_files, run_complete_folder
from ms2query.ms2library import create_library_object_from_one_dir
import os
import pandas as pd
import numpy as np
from typing import List

In [None]:
# Folder holding the ms2query models and reference library used by this notebook.
models_and_library_folder_pos = os.path.join(
    "data_and_output", "models", "ms2query_models_and_library_pos"
)

# Every wheat-study input and output lives in one folder; build it once and derive all paths from it.
_wheat_data_dir = os.path.join("data_and_output", "wheat_data")

# Raw MGF inputs.
raw_experimental_mgf_filename = os.path.join(_wheat_data_dir, "MonoisotopicMS2Feature_pos_mostIntense.mgf")
raw_standards_mgf_filename = os.path.join(_wheat_data_dir, "phophe_pos.mgf")

# Cleaned spectra, written back out as MGF.
processed_experimental_spectra_filename = os.path.join(_wheat_data_dir, "spectra_experimental.mgf")
processed_standard_spectra_filename = os.path.join(_wheat_data_dir, "spectra_standards.mgf")

# ms2query result tables.
csv_experimental_filename = os.path.join(_wheat_data_dir, "wheat_data_ms2query_out_experimental.csv")
csv_standards_filename = os.path.join(_wheat_data_dir, "wheat_data_ms2query_out_standard.csv")

# Cached pairwise similarity matrices.
scores_s2v_filename = os.path.join(_wheat_data_dir, "s2v.npy")
scores_ms2ds_filename = os.path.join(_wheat_data_dir, "ms2ds.npy")
scores_cos_filename = os.path.join(_wheat_data_dir, "cos.npy")

# Process Raw Data


In [None]:
# Read the raw experimental spectra from MGF and run them through the project's cleaning step.
spectra_experimental = importing.clean_spectra(
    list(load_from_mgf(raw_experimental_mgf_filename))
)

# Same treatment for the reference-standard spectra.
spectra_standards = importing.clean_spectra(
    list(load_from_mgf(raw_standards_mgf_filename))
)

# Export of the cleaned spectra is switched off; flip the guard to rewrite the processed MGF files.
if False:
    save_as_mgf(spectra_experimental, processed_experimental_spectra_filename)
    save_as_mgf(spectra_standards, processed_standard_spectra_filename)

# Run Classification using ms2query

In [None]:
# WARNING
# This works, but it automatically runs both the standards and the experimental spectra if they are
# in the same folder. Update to the latest ms2query version to allow single-file input and output;
# once updated, filenames can also be specified.
# NOTE(review): the calls below go through importing.run_single_file with explicit input and output
# file names, so the warning above may be out of date — confirm before relying on it.
# Disabled by default: flip the guard to re-run the ms2query classification on the processed MGF files.
if False:
    ms2library = create_library_object_from_one_dir(models_and_library_folder_pos)
    importing.run_single_file(ms2library, processed_standard_spectra_filename, str(csv_standards_filename))
    importing.run_single_file(ms2library, processed_experimental_spectra_filename, str(csv_experimental_filename))

# Merge class tables and spectra lists

In [None]:
# Read each ms2query result table and expand it against the number of spectra in the matching list.
# What "expand" does exactly is defined by importing.expand_ms2query_results_table.
classes_experimental = importing.expand_ms2query_results_table(
    pd.read_csv(csv_experimental_filename),
    len(spectra_experimental),
)
classes_standards = importing.expand_ms2query_results_table(
    pd.read_csv(csv_standards_filename),
    len(spectra_standards),
)

In [None]:
# Tag every row with its origin and add an (initially empty) metadata column per source.
classes_experimental["is_standard"] = False
classes_experimental["exp_metadata"] = None
classes_standards["is_standard"] = True
classes_standards["standard_metadata"] = None

# Stack the two tables: experimental rows first, standards second. Row position becomes the
# specxplore id below, so the row order must be exactly "experimental, then standards".
# pd.concat guarantees that order; an outer pd.merge is documented to sort rows by the join keys
# and therefore cannot be relied on to keep it.
# The trailing reset_index() keeps the extra "index" column that the earlier merge-based version
# of this cell produced, so the table schema seen downstream is unchanged.
all_class_table = pd.concat(
    [classes_experimental, classes_standards], ignore_index=True
).reset_index()

# Stacking must neither drop nor duplicate rows.
assert len(all_class_table) == len(classes_experimental) + len(classes_standards)

all_class_table["specxplore_id"] = all_class_table.index
all_class_table

In [None]:
# Experimental spectra first, standards second. Positions in this combined list are later used
# as spectrum ids when converting to specxplore spectra, so the order matters.
all_spectra = spectra_experimental + spectra_standards # list addition

In [None]:
# Disabled by default: flip the guard to recompute the three pairwise score matrices for all_spectra
# (spec2vec, modified cosine, ms2deepscore) and cache them as .npy files. The next cell loads
# these same files, so they must exist before it can run.
if False:
    scores_s2v = importing.compute_similarities_s2v(all_spectra, models_and_library_folder_pos)
    scores_cos = importing.compute_similarities_cosine(all_spectra, cosine_type="ModifiedCosine")
    scores_ms2ds = importing.compute_similarities_ms2ds(all_spectra, models_and_library_folder_pos)
    # allow_pickle=False: store plain numeric arrays only, never pickled Python objects.
    np.save(scores_ms2ds_filename, scores_ms2ds, allow_pickle=False)
    np.save(scores_s2v_filename, scores_s2v, allow_pickle=False)
    np.save(scores_cos_filename, scores_cos, allow_pickle=False)

In [None]:
# Load the cached pairwise score matrices from the .npy files named in the configuration cell.
scores_s2v = np.load(scores_s2v_filename)
scores_ms2ds = np.load(scores_ms2ds_filename)
scores_cos = np.load(scores_cos_filename)

# Heuristic score: element-wise maximum of the ms2ds and cosine scores.
scores_heuristic = np.maximum(scores_ms2ds, scores_cos)
# From here on the name scores_ms2ds refers to the heuristic matrix, not the raw ms2ds scores
# loaded above. Everything downstream that reads scores_ms2ds gets the heuristic values.
scores_ms2ds = scores_heuristic # <-- overwriting ms2ds score here!!!

# Run k-medoid clustering and t-SNE embedding

In [None]:
# Candidate numbers of clusters for the k-medoid grid.
k_values = [10, 20, 30, 40, 50, 75, 100, 125, 150, 175, 200, 250, 300, 400, 500]

# One seed per k in [0, 1000), drawn from a generator with a fixed master seed so that a
# Restart & Run All reproduces the same grid (the previous version drew from the unseeded
# global NumPy RNG, giving different clusterings on every run).
kmedoid_seed_rng = np.random.default_rng(0)
random_seeds = kmedoid_seed_rng.integers(0, 1000, size=len(k_values)).tolist()

# scores_ms2ds holds the heuristic max(ms2ds, cosine) matrix assigned in the score-loading cell.
similarities = scores_ms2ds
distances = importing.convert_similarity_to_distance(similarities)
kmedoid_list = importing.run_kmedoid_grid(distances, k_values, random_seeds)
importing.render_kmedoid_fitting_results_in_browser(kmedoid_list)

In [None]:
# Perplexities 5, 10, ..., 50 for the t-SNE grid.
perplexity_values = list(range(5, 55, 5))

# One seed per perplexity in [0, 1000), drawn from a generator with a fixed master seed.
# A later cell picks one grid entry by position, which is only meaningful if re-running this
# cell yields the same embeddings; the previous version used the unseeded global NumPy RNG.
tsne_seed_rng = np.random.default_rng(1)
random_seeds = tsne_seed_rng.integers(0, 1000, size=len(perplexity_values)).tolist()

tsne_list_pos = importing.run_tsne_grid(distances, perplexity_values, random_seeds)
importing.render_tsne_fitting_results_in_browser(tsne_list_pos)

# Merge into specxplore object

In [None]:
%load_ext autoreload
%autoreload 2
import specxplore.specxplore_data
from specxplore.specxplore_data import specxplore_data, Spectrum
# Origin flag per row of the class table, as a plain numpy array.
is_standard = np.array(all_class_table["is_standard"])

# Chemical-class annotation columns taken over from the class table.
class_annotation_columns = [
    'cf_kingdom', 'cf_superclass', 'cf_class', 'cf_subclass',
    'cf_direct_parent', 'npc_class_results', 'npc_superclass_results',
    'npc_pathway_results',
]
spec_classes = all_class_table[class_annotation_columns]

# Precursor m/z of every spectrum, in all_spectra order.
mz = [spectrum.get("precursor_mz") for spectrum in all_spectra]

# One column per k-medoid grid entry, labelled by its k and holding that run's cluster assignments.
# Grid entries look like KmedoidGridEntry(k, cluster_assignments, score, random_states[idx]).
kclass_table = pd.DataFrame(
    {entry.k: entry.cluster_assignments for entry in kmedoid_list}
)

# Annotation classes and k-medoid cluster labels side by side.
classes = pd.concat([spec_classes, kclass_table], axis=1)

In [None]:
# Use the t-SNE grid entry at position 8 as the 2-D layout.
selected_tsne_entry = tsne_list_pos[8]
tsnedf = pd.DataFrame(
    {
        "x": selected_tsne_entry.x_coordinates,
        "y": selected_tsne_entry.y_coordinates,
    }
)

# Spectrum ids as a plain numpy array.
specxplore_id = np.array(all_class_table["specxplore_id"])

def convert_matchms_spectra_to_specxplore_spectrum(spectra: List[matchms.Spectrum]) -> List[Spectrum]:
  """Convert matchms spectra into specxplore ``Spectrum`` objects.

  Each output object is built from, in this order: the input's fragment m/z
  values, its ``precursor_mz`` metadata entry cast to ``float``, its position
  in ``spectra`` (presumably serving as the spectrum id), and its fragment
  intensities.

  Args:
    spectra: matchms spectra to convert. The position of each spectrum in
      this list is passed on to the converted object.

  Returns:
    One specxplore ``Spectrum`` per input spectrum, in input order.

  Raises:
    TypeError: if ``precursor_mz`` is missing from a spectrum, because the
      metadata lookup then yields a value ``float()`` cannot convert
      (assuming ``get`` returns ``None`` for absent keys).
  """
  # Iterate over the argument, not the notebook-level all_spectra, so the result depends only
  # on what the caller passes in.
  return [
      specxplore.specxplore_data.Spectrum(
          spectrum.peaks.mz,
          float(spectrum.get("precursor_mz")),
          position,
          spectrum.peaks.intensities,
      )
      for position, spectrum in enumerate(spectra)
  ]

# Convert the combined spectrum list into specxplore Spectrum objects.
spectra_converted = convert_matchms_spectra_to_specxplore_spectrum(all_spectra)

# Bundle the three score matrices, the t-SNE layout, the class tables, the standard flags, the
# converted spectra, the precursor m/z values, the ids and the full class table into one object.
# All arguments are positional, so their order must match the specxplore_data constructor —
# NOTE(review): confirm against its signature.
# scores_ms2ds here is the heuristic max(ms2ds, cosine) matrix assigned in the score-loading cell.
wheat_data_specxplore = specxplore_data(
  scores_ms2ds,scores_s2v, scores_cos, tsnedf,  classes,
  is_standard, spectra_converted, mz, specxplore_id, all_class_table)

In [None]:
# Display the first converted spectrum as a quick check of the assembled object.
wheat_data_specxplore.spectra[0]

In [None]:
import pickle

# Build the output path with os.path.join, like every other path in this notebook, instead of a
# hard-coded "/"-separated string. The "_v6" suffix is the author's manual version tag.
specxplore_pickle_filename = os.path.join(
    "data_and_output", "wheat_data", "wheat_data_specxplore_v6.pickle"
)

# Serialise the assembled specxplore object; an existing file at this path is overwritten.
with open(specxplore_pickle_filename, 'wb') as pickle_file:
  pickle.dump(wheat_data_specxplore, pickle_file)