Jupyter Notebook used to create additional supporting figures for the manuscript supplement. This notebook requires an additional installation of umap-learn using "pip install umap-learn" (version 0.5.6, umap-learn not to be mistaked with the unrelated umap package), in addition to the msfeast python installation.

In [32]:
%load_ext autoreload
%autoreload 2
from matchms.importing import load_from_mgf
from matchms.exporting import save_as_mgf
import pandas as pd
import os
import msfeast.pipeline
import os
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import umap
from msfeast.spectral_comparison import convert_similarity_to_distance
from sklearn.manifold import MDS
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr, pearsonr
from sklearn.manifold import TSNE
from msfeast.process_spectra import extract_feature_ids_from_spectra

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Definining utility functions
from typing import List
def select_random_subset_of_classes (class_assignments : List[str], k : int, seed = 78347834783478):
  """Function extracts a random set of k classes from class assignments, and sets all other classes to 'other' """
  unique_classes = np.unique(class_assignments)
  default_rng = np.random.default_rng(seed)
  selected_classes =  set(default_rng.choice(unique_classes.tolist(), size=k, replace=False, shuffle=True))
  out_classes = [cluster if cluster in selected_classes else "other" for cluster in class_assignments]
  return out_classes

In [3]:
print("Define Filepaths...")
test_data_directory = os.path.join("data", "omsw_pleurotus_ms2deepscore")
filepath_test_spectra = os.path.join(test_data_directory, "spectra.mgf")
filepath_test_quant_table = os.path.join(test_data_directory, "quant_table.csv")
filepath_test_treat_table = os.path.join(test_data_directory, "treat_table.csv")
model_path = os.path.join("..", "models", "ms2deepscore_model.pt")
output_directory = os.path.join(test_data_directory)

Define Filepaths...


In [4]:
print("Initializing pipeline...")
pipeline = msfeast.pipeline.Msfeast()

print("Attaching data...")
treat_table = pd.read_csv(filepath_test_treat_table)
quant_table = pd.read_csv(filepath_test_quant_table)
pipeline.attach_spectra_from_file(filepath_test_spectra, identifier_key="scans")
pipeline.attach_quantification_table(quant_table)
pipeline.attach_treatment_table(treat_table)

Initializing pipeline...
Attaching data...


In [5]:
print("Running spectral similarity computations...")
pipeline.run_and_attach_spectral_similarity_computations("ms2deepscore", model_directory=model_path)

Running spectral similarity computations...
The model version (0.5.0) does not match the version of MS2Deepscore (2.0.0), consider downloading a new model or changing the MS2Deepscore version


1975it [00:09, 207.36it/s]


In [6]:
print("Run kmedoid grid...")
pipeline.run_and_attach_kmedoid_grid([5,10,15,20, 25, 50,100,150,200,250, 300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800, 1900])

Run kmedoid grid...
Kmedoid grid results. Use to inform kmedoid classification selection ilocs.
    iloc     k  silhouette_score  random_seed_used
0      0     5          0.172741                 0
1      1    10          0.184887                 0
2      2    15          0.181562                 0
3      3    20          0.192774                 0
4      4    25          0.190214                 0
5      5    50          0.198892                 0
6      6   100          0.203835                 0
7      7   150          0.206368                 0
8      8   200          0.220599                 0
9      9   250          0.231379                 0
10    10   300          0.241323                 0
11    11   400          0.250223                 0
12    12   500          0.259888                 0
13    13   600          0.264054                 0
14    14   700          0.258923                 0
15    15   800          0.257585                 0
16    16   900          0.250946     

In [41]:
from msfeast.kmedoid_clustering import plot_kmedoid_grid
figure = plot_kmedoid_grid(pipeline.kmedoid_grid)
figure.write_html(os.path.join(output_directory, "silhouette scores" + ".html"))
figure.write_image(os.path.join(output_directory, "silhouette scores" + ".svg"), width = "1000", height="800")
figure

In [8]:
print("Run t-sne grid...")
pipeline.run_and_attach_tsne_grid([5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60,70,80,90,100, 200,300,400,500, 600, 700, 800, 900, 1000, 1250, 1500, 1750])

Run t-sne grid...
T-sne grid results. Use to inform t-sne embedding selection.
    iloc  perplexity  pearson_score  spearman_score  random_seed_used
0      0           5       0.555606        0.558838                 0
1      1          10       0.582606        0.591094                 0
2      2          15       0.581829        0.587025                 0
3      3          20       0.588174        0.563051                 0
4      4          25       0.586727        0.560805                 0
5      5          30       0.589654        0.563378                 0
6      6          35       0.597382        0.574214                 0
7      7          40       0.598774        0.574423                 0
8      8          45       0.602116        0.577223                 0
9      9          50       0.594160        0.565570                 0
10    10          60       0.597985        0.568614                 0
11    11          70       0.609068        0.584109                 0
12    12   

In [42]:
from msfeast.embedding import plot_tsne_grid, plot_selected_embedding
figure = plot_tsne_grid(pipeline.tsne_grid)
filename = os.path.join(output_directory, "t-SNE perplexity grid" + ".html")
figure.write_html(os.path.join(output_directory, "t-SNE perplexity grid" + ".html"))
figure.write_image(os.path.join(output_directory, "t-SNE perplexity grid" + ".svg"), width = "1000", height="800")
figure.show()

In [10]:
selected_iloc = 5
embedding_coordinates_table = pd.DataFrame({"feature_id" : extract_feature_ids_from_spectra(pipeline.spectra_matchms)})
embedding_coordinates_table["x"] = pipeline.tsne_grid[selected_iloc].x_coordinates
embedding_coordinates_table["y"] = pipeline.tsne_grid[selected_iloc].y_coordinates
plot_selected_embedding(embedding_coordinates_table).update_layout(title = "Perplexity = 30")

In [11]:
selected_iloc = 26
embedding_coordinates_table = pd.DataFrame({"feature_id" : extract_feature_ids_from_spectra(pipeline.spectra_matchms)})
embedding_coordinates_table["x"] = pipeline.tsne_grid[selected_iloc].x_coordinates
embedding_coordinates_table["y"] = pipeline.tsne_grid[selected_iloc].y_coordinates
plot_selected_embedding(embedding_coordinates_table).update_layout(title = "Perplexity = 30")

Individual T-sne Runs

In [12]:
distances = convert_similarity_to_distance(pipeline.similarity_array)

In [13]:
def generate_tsne_variant(distances, perplexity, feature_ids, class_assignments, title_string):
  reducer = TSNE(
      metric="precomputed", 
      random_state = 0, 
      init = "random", 
      perplexity = perplexity
  )
  embedding = reducer.fit_transform(distances)
  embedding_coordinates_table = pd.DataFrame({"feature_id" : feature_ids})
  embedding_coordinates_table["x"] = embedding[:, 0]
  embedding_coordinates_table["y"] = embedding[:, 1]
  embedding_coordinates_table["cluster"] = class_assignments
  dist_embedding = squareform(pdist(embedding, 'seuclidean'))
  spearman_score = np.array(spearmanr(distances.flat, dist_embedding.flat))[0]
  pearson_score = np.array(pearsonr(distances.flat, dist_embedding.flat))[0]
  print(f"Pearson Score: {round(pearson_score, 3)}, Spearman Score: {round(spearman_score, 3)}")
  figure  = plotly.express.scatter(
    data_frame= embedding_coordinates_table, x = "x", y = "y", hover_data=["feature_id"], color = "cluster",
    width=1000, height=800
  )
  figure.update_layout(
    title = f"{title_string} <br><sup>Pearson Score: {round(pearson_score, 3)}, Spearman Score: {round(spearman_score, 3)}</sup>",
    xaxis_title="X Coordinate Embedding",
    yaxis_title="Y Coordinate Embedding"
  )
  return figure

In [43]:
perplexity = 30
title = f"t-SNE: perplexity = {perplexity}, 30 random classes from a set of 250 are highlighted in color."
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[9].cluster_assignments] # iloc9 --> k = 250 clusters
visualized_classes = select_random_subset_of_classes(class_assignments, 30)
figure = generate_tsne_variant(distances, perplexity, feature_ids, visualized_classes,  title)
figure.update_traces(marker=dict(color='lightgrey'), selector=dict(name='other'))
#figure.update_layout(legend = dict(font = dict(size = 9)))
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")

Pearson Score: 0.59, Spearman Score: 0.563


In [44]:
perplexity = 30
title = f"t-SNE: perplexity = {perplexity}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_tsne_variant(distances, perplexity, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")

Pearson Score: 0.59, Spearman Score: 0.563


In [45]:
perplexity = 50
title = f"t-SNE: perplexity = {perplexity}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_tsne_variant(distances, perplexity, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")

Pearson Score: 0.594, Spearman Score: 0.566


In [46]:
perplexity = 100
title = f"t-SNE: perplexity = {perplexity}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_tsne_variant(distances, perplexity, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")

Pearson Score: 0.587, Spearman Score: 0.584


In [47]:
perplexity = 1500
title = f"t-SNE: perplexity = {perplexity}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_tsne_variant(distances, perplexity, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")

Pearson Score: 0.773, Spearman Score: 0.803


# Running umap embeddings with cluster overlays

In [19]:
distances = convert_similarity_to_distance(pipeline.similarity_array)

In [20]:
def generate_umap_variant(distances, n_neighbors, min_dist, feature_ids, class_assignments, title_string):
  reducer = umap.UMAP(n_components = 2, n_neighbors=50, min_dist=0.05, metric = "precomputed") # default min distance #metric='precomputed'
  embedding = reducer.fit_transform(distances)
  embedding_coordinates_table = pd.DataFrame({"feature_id" : feature_ids})
  embedding_coordinates_table["x"] = embedding[:, 0]
  embedding_coordinates_table["y"] = embedding[:, 1]
  embedding_coordinates_table["cluster"] = class_assignments
  dist_embedding = squareform(pdist(embedding, 'seuclidean'))
  spearman_score = np.array(spearmanr(distances.flat, dist_embedding.flat))[0]
  pearson_score = np.array(pearsonr(distances.flat, dist_embedding.flat))[0]
  print(f"Pearson Score: {round(pearson_score, 3)}, Spearman Score: {round(spearman_score, 3)}")
  figure  = plotly.express.scatter(
    data_frame= embedding_coordinates_table, x = "x", y = "y", hover_data=["feature_id"], color = "cluster",
    width=1000, height=800
  )
  figure.update_layout(
    title = f"{title_string} <br><sup>Pearson Score: {round(pearson_score, 3)}, Spearman Score: {round(spearman_score, 3)}</sup>",
    xaxis_title="X Coordinate Embedding",
    yaxis_title="Y Coordinate Embedding"
  )
  return figure

In [48]:
n_neighbors = 50; min_dist = 0.1; 
title = f"umap: n_neighbors = {n_neighbors}, min_dist = {min_dist}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_umap_variant(distances, n_neighbors, min_dist, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")


using precomputed metric; inverse_transform will be unavailable



Pearson Score: 0.559, Spearman Score: 0.548


In [49]:
n_neighbors = 30; min_dist = 0.1; 
title = f"umap: n_neighbors = {n_neighbors}, min_dist = {min_dist}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_umap_variant(distances, n_neighbors, min_dist, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")


using precomputed metric; inverse_transform will be unavailable



Pearson Score: 0.548, Spearman Score: 0.543


In [50]:
n_neighbors = 100; min_dist = 0.1; 
title = f"umap: n_neighbors = {n_neighbors}, min_dist = {min_dist}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_umap_variant(distances, n_neighbors, min_dist, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")


using precomputed metric; inverse_transform will be unavailable



Pearson Score: 0.565, Spearman Score: 0.55


In [51]:
n_neighbors = 1200; min_dist = 0.1; 
title = f"umap: n_neighbors = {n_neighbors}, min_dist = {min_dist}"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_umap_variant(distances, n_neighbors, min_dist, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")


using precomputed metric; inverse_transform will be unavailable



Pearson Score: 0.548, Spearman Score: 0.535


# MDS Variant

In [25]:
def generate_mds_variant(distances, feature_ids, class_assignments, title_string):
  reducer = MDS(n_components=2, normalized_stress='auto', dissimilarity='precomputed')
  embedding = reducer.fit_transform(distances)
  embedding_coordinates_table = pd.DataFrame({"feature_id" : feature_ids})
  embedding_coordinates_table["x"] = embedding[:, 0]
  embedding_coordinates_table["y"] = embedding[:, 1]
  embedding_coordinates_table["cluster"] = class_assignments

  dist_embedding = squareform(pdist(embedding, 'seuclidean'))
  spearman_score = np.array(spearmanr(distances.flat, dist_embedding.flat))[0]
  pearson_score = np.array(pearsonr(distances.flat, dist_embedding.flat))[0]
  print(f"Pearson Score: {round(pearson_score, 3)}, Spearman Score: {round(spearman_score, 3)}")
  figure  = plotly.express.scatter(
    data_frame= embedding_coordinates_table, x = "x", y = "y", hover_data=["feature_id"], color = "cluster",
    width=1000, height=800
  )
  figure.update_layout(
    title = f"{title_string} <br><sup>Pearson Score: {round(pearson_score, 3)}, Spearman Score: {round(spearman_score, 3)}</sup>",
    xaxis_title="X Coordinate Embedding",
    yaxis_title="Y Coordinate Embedding"
  )
  return figure

In [52]:
title = f"MDS variant"
filename = os.path.join(output_directory, title + ".html")
feature_ids = extract_feature_ids_from_spectra(pipeline.spectra_matchms)
class_assignments = [str(cluster) for cluster in  pipeline.kmedoid_grid[3].cluster_assignments] # iloc3 --> k = 20
visualized_classes = select_random_subset_of_classes(class_assignments, 20)
figure = generate_mds_variant(distances, feature_ids, visualized_classes,  title)
figure.show()
figure.write_html(filename)
figure.write_image(os.path.join(output_directory, title + ".svg"), width = "1000", height="800")

Pearson Score: 0.779, Spearman Score: 0.814
