In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import msfeastPipeline as msfeast
import os

In [2]:
test_data_directory = "test_data_large"
output_data_directory = "output_mushroom_data"
filepath_test_spectra = os.path.join(test_data_directory, "test_spectra.mgf")
filepath_test_quant_table = os.path.join(test_data_directory, "test_quant_table.csv")
filepath_test_treat_table = os.path.join(test_data_directory, "test_treat_table.csv")

In [3]:
treat_table = pd.read_csv(filepath_test_treat_table)
quant_table = pd.read_csv(filepath_test_quant_table)
pipeline = msfeast.Msfeast()
pipeline.attach_spectral_data_from_file(filepath_test_spectra, identifier_key="scans")
pipeline.attach_quantification_table(quant_table)
pipeline.attach_treatment_table(treat_table)

In [4]:
pipeline.run_spectral_similarity_computations("ModifiedCosine")

In [5]:
pipeline.run_and_attach_kmedoid_grid([8,20,25,30,35,40,45,50,100,150,200, 250,300, 500, 1000, 1100, 1200, 1400])

Kmedoid grid results. Use to inform kmedoid classification selection ilocs.
    iloc     k  silhouette_score  random_seed_used
0      0     8          0.187362                 0
1      1    20          0.219239                 0
2      2    25          0.224960                 0
3      3    30          0.233435                 0
4      4    35          0.197751                 0
5      5    40          0.236389                 0
6      6    45          0.172695                 0
7      7    50          0.194021                 0
8      8   100          0.191175                 0
9      9   150          0.229118                 0
10    10   200          0.216376                 0
11    11   250          0.191403                 0
12    12   300          0.202645                 0
13    13   500          0.287208                 0
14    14  1000          0.273959                 0
15    15  1100          0.267391                 0
16    16  1200          0.293316                 0
17    

In [6]:
pipeline.run_and_attach_tsne_grid([10, 15, 20, 30, 40, 50, 75, 100, 200, 400, 600, 800])

T-sne grid results. Use to inform t-sne embedding selection.
    iloc  perplexity  pearson_score  spearman_score  random_seed_used
0      0          10       0.340426        0.301655                 0
1      1          15       0.400193        0.360843                 0
2      2          20       0.414627        0.379262                 0
3      3          30       0.429403        0.398009                 0
4      4          40       0.387945        0.357227                 0
5      5          50       0.407833        0.384776                 0
6      6          75       0.488215        0.476383                 0
7      7         100       0.509835        0.489439                 0
8      8         200       0.529037        0.509766                 0
9      9         400       0.617392        0.605229                 0
10    10         600       0.666133        0.660180                 0
11    11         800       0.683206        0.675283                 0


In [9]:
pipeline.select_kmedoid_settings(iloc = 8)
pipeline.select_tsne_settings(iloc = 0)

In [25]:
pipeline.run_r_testing_routine(output_data_directory, "r_output.json", top_k = 50)

[1] "Starting Routine log at " "2024-02-20 10:04:22"     
[1] "R Routine: run integration test..."
[1] "R Routine: Validating input file paths..."
[1] "R Routine: Loading required packages..."
[1] "R Routine: Reading input files..."
[1] "R Routine: running global test and fold change computations..."
[1] "R Routine: exporting globaltest and log fold change computations..."
[1] "R Routine: complete, file saved, exiting R session."


package ‘dplyr’ was built under R version 4.2.3 


In [26]:
pipeline.export_to_json_file(os.path.join(output_data_directory, "dashboard_data.json"))

Plot Similarity matrix

Plot distance matrix

In [7]:
import numpy as np
import plotly.express as px
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering
from scipy.cluster import hierarchy
import copy
# Assume 'similarity_array' is your pairwise similarity array
similarity_array = copy.deepcopy(pipeline.similarity_array)
# Compute the distances for hierarchical clustering
distances = msfeast._convert_similarity_to_distance(similarity_array)

Compute t-SNE embedding

In [8]:
tsnedf = pipeline.embedding_coordinates_table
tsnedf["k-medoid cluster"] = pipeline.assignment_table["set_id"]
fig = px.scatter(
  tsnedf, x="x", y="y",
  color_discrete_sequence=px.colors.qualitative.Set1
)
fig.update_traces(textposition='top center')
fig.update_layout(
  height=800, 
  width =1000, 
  title_text='t-SNE Embedding of Similarity Data',
  xaxis_title = "t-sne embedding x coordinate",
  yaxis_title = "t-sne embedding y coordinate")
fig.update_traces(marker={'size': 3})
fig.show()

TypeError: 'NoneType' object is not subscriptable

In [None]:
tsnedf = pipeline.embedding_coordinates_table
tsnedf["k-medoid cluster"] = pipeline.assignment_table["set_id"]
fig = px.scatter(
  tsnedf, x="x", y="y", color="k-medoid cluster",
  color_discrete_sequence=px.colors.qualitative.Set1
)
fig.update_traces(textposition='top center')
fig.update_layout(
  height=800, 
  width =1000, 
  title_text='t-SNE Embedding of Similarity Data With K-medoid Clusters',
  xaxis_title = "t-sne embedding x coordinate",
  yaxis_title = "t-sne embedding y coordinate")
fig.update_traces(marker={'size': 3})
fig.show()

# Alternative Clustering Approaches


In [None]:
import sklearn.cluster
hclust_setting = sklearn.cluster.AgglomerativeClustering(metric = "precomputed", n_clusters = 400, linkage="average")
output = hclust_setting.fit_predict(distances) 


In [None]:
tsnedf = pipeline.embedding_coordinates_table
tsnedf["h-clust"] = np.array(output, dtype = str)
fig = px.scatter(
  tsnedf, x="x", y="y", color="h-clust",
  color_discrete_sequence=px.colors.qualitative.Set1
)
fig.update_traces(textposition='top center')
fig.update_layout(
  height=800, 
  width =1000, 
  title_text='t-SNE Embedding of Similarity Data With Hier. Clusters',
  xaxis_title = "t-sne embedding x coordinate",
  yaxis_title = "t-sne embedding y coordinate")
fig.update_traces(marker={'size': 3})
fig.show()

In [12]:
from kmedoids import KMedoids
import sklearn.cluster

In [18]:
scores = []
for n_clust in range(5, 2500, 50):
  hclust_setting = sklearn.cluster.AgglomerativeClustering(
    metric = "precomputed", n_clusters = n_clust, linkage="average"
  )
  output = hclust_setting.fit_predict(distances) 
  cluster = KMedoids(
    n_clusters=n_clust, 
    metric='precomputed', 
    random_state=0, 
    method = "fasterpam"
  )  
  cluster_assignments = cluster.fit_predict(distances)
  cluster_assignments_strings = [
      "km_" + str(elem) 
      for elem in cluster_assignments
  ]
  score_kmedoid =  sklearn.metrics.silhouette_score(
      X = distances, 
      labels = cluster_assignments_strings, 
      metric= "precomputed"
  )
  score_hclust = sklearn.metrics.silhouette_score(distances, output, metric='precomputed')
  scores.append((n_clust, score_hclust, score_kmedoid))

In [19]:
import pandas as pd
df = pd.DataFrame.from_records(scores)
fig1 = px.scatter(df, x = 0, y = 1)
fig2 = px.scatter(df, x = 0, y = 2)
fig1.show()
fig2.show()