In [131]:
import os

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import sklearn.cluster
import sklearn.metrics
import umap
from kmedoids import KMedoids
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import OPTICS, SpectralClustering, cluster_optics_dbscan
from sklearn.manifold import MDS
from sklearn.manifold import Isomap, TSNE

import msfeastPipeline as msfeast

In [132]:
# Input and output locations, all relative to the current working directory.
test_data_directory = os.path.join("tmp_outputs", "test_data_large")
output_data_directory = os.path.join("tmp_outputs", "output_cluster_testing")
# Spectra to load, and the on-disk cache of the pairwise similarity matrix.
filepath_test_spectra = os.path.join(test_data_directory, "test_spectra.mgf")
filepath_similarity_array = os.path.join(output_data_directory, "similarity_array.npy")

## Get Similarity Array for large data

In [133]:
pipeline = msfeast.Msfeast()
pipeline.attach_spectral_data_from_file(filepath_test_spectra, identifier_key="scans")
# The pairwise similarity computation is expensive, so it only runs when no
# cached array exists on disk. Delete the cache file to force a recomputation.
if not os.path.exists(filepath_similarity_array):
  pipeline.run_spectral_similarity_computations("ModifiedCosine")
  # np.save does not create missing directories, so make sure the target exists.
  os.makedirs(os.path.dirname(filepath_similarity_array), exist_ok=True)
  np.save(filepath_similarity_array, pipeline.similarity_array, allow_pickle=False)
# Always read the array back from the cache so every run uses the same data.
similarity_array = np.load(filepath_similarity_array)
pipeline.similarity_array = similarity_array

## Create & investigate embedding approaches
dim reduction & manifold learning

**PCA**

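Note: the embedding in this section is computed with scikit-learn's `MDS` on the precomputed distance matrix rather than with PCA; the "PCA" label is kept because the other notes refer to it.

A weird butt shape: many points are concentrated in one dense group, and all the rest are spread out far around a ridge.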

In [None]:
# Convert the similarity matrix into the distances used as precomputed
# dissimilarities by the embedding below.
distances = msfeast._convert_similarity_to_distance(similarity_array)
# NOTE: despite the `pca_` prefix this is scikit-learn's MDS on a precomputed
# dissimilarity matrix. The names are kept because the plotting cell reads
# `pca_coordinates`.
# random_state is fixed so that a rerun reproduces the same layout.
pca_embedding = MDS(
  n_components=2,
  normalized_stress='auto',
  dissimilarity="precomputed",
  random_state=0,
)
pca_coordinates = pca_embedding.fit_transform(distances)


In [None]:
# Two-column frame (x, y) with one row per embedded point.
data_table = pd.DataFrame(pca_coordinates[:, :2], columns=["x", "y"])
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='PCA Embedding of Similarity Data',
  width=1000,
  height=800,
)
# Small markers keep the dense regions of the scatter readable.
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

**ISOMAP**

Very little impact from `n_neighbors`; going from 5 to 1000 leads to rather similar results.
There is some similarity to the PCA results, with a densely packed central ridge and the remaining points spread out around it.

In [None]:
# Isomap is fitted directly on the precomputed pairwise distances.
distances = msfeast._convert_similarity_to_distance(similarity_array)
# n_neighbors is left at its default here (n_neighbors=5 was also tried).
isomap_embedding = Isomap(
  metric="precomputed",
  n_components=2,
)
isomap_coordinates = isomap_embedding.fit_transform(distances)

In [None]:
# Two-column frame (x, y) with one row per embedded point.
data_table = pd.DataFrame(isomap_coordinates[:, :2], columns=["x", "y"])
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='Isomap Embedding of Similarity Data',
  width=1000,
  height=800,
)
# Small markers keep the dense regions of the scatter readable.
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

**TSNE**

--> the PCA precomputation init does fairly little of any obvious impact compared to all defaults.

--> early exaggeration doesn't seem to impact things much, at least not compared to perplexity which is the more meaningful setting anyways. Similar for learning rate. 

--> going for very high perplexity seems to get one closer to the PCA results (1000+) ; the weird butt shape starts to appear

In [None]:
# Convert the similarity matrix into the precomputed distances t-SNE embeds.
distances = msfeast._convert_similarity_to_distance(similarity_array)


def _fit_tsne(distance_matrix, perplexity):
  """Fit a 2-D t-SNE on a precomputed distance matrix.

  Args:
    distance_matrix: square matrix of pairwise distances.
    perplexity: t-SNE perplexity setting to use for this fit.

  Returns:
    Tuple of (fitted TSNE estimator, coordinates returned by fit_transform).
  """
  # random_state is fixed so that a rerun reproduces the same layout.
  model = TSNE(
    n_components=2,
    metric="precomputed",
    init="random",
    perplexity=perplexity,
    random_state=0,
  )
  return model, model.fit_transform(distance_matrix)


# Alternative that was tried: init=pca_coordinates (needs the MDS result,
# which takes quite a while to compute).
# Other settings that were tried: early_exaggeration=12, learning_rate=100.
tsne_embedding, tsne_coordinates30 = _fit_tsne(distances, perplexity=30)
tsne_embedding, tsne_coordinates100 = _fit_tsne(distances, perplexity=100)
tsne_embedding, tsne_coordinates1500 = _fit_tsne(distances, perplexity=1500)

In [None]:
# Two-column frame (x, y) with one row per embedded point (perplexity 30).
data_table = pd.DataFrame(tsne_coordinates30[:, :2], columns=["x", "y"])
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='TSNE 30 Embedding of Similarity Data',
  width=1000,
  height=800,
)
# Small markers keep the dense regions of the scatter readable.
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

In [None]:
# Two-column frame (x, y) with one row per embedded point (perplexity 100).
data_table = pd.DataFrame(tsne_coordinates100[:, :2], columns=["x", "y"])
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='TSNE 100 Embedding of Similarity Data',
  width=1000,
  height=800,
)
# Small markers keep the dense regions of the scatter readable.
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

In [None]:
# Two-column frame (x, y) with one row per embedded point (perplexity 1500).
data_table = pd.DataFrame(tsne_coordinates1500[:, :2], columns=["x", "y"])
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='TSNE 1500 Embedding of Similarity Data',
  width=1000,
  height=800,
)
# Small markers keep the dense regions of the scatter readable.
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

**UMAP**

--> `n_neighbors`: number of neighbors considered; can be used to force a local or a global view. Setting it very high leads to very strange artefacts. Unlike t-SNE, it does not seem to approach the MDS results.

--> min_dist: controls the distance between elements, between 0 and 1, low values allow tight packing, while high values force points away from one another


In [None]:
# Convert the similarity matrix into the precomputed distances UMAP embeds.
distances = msfeast._convert_similarity_to_distance(similarity_array)
# Settings that were also tried: n_neighbors=100, min_dist=0.2.
# random_state is fixed so that a rerun reproduces the same layout
# (NOTE(review): umap-learn runs single-threaded when a seed is set — confirm
# the runtime is acceptable for this data size).
umap_embedding = umap.UMAP(
  n_components=2,
  metric="precomputed",
  random_state=0,
)
umap_coordinates = umap_embedding.fit_transform(distances)

In [None]:
# Two-column frame (x, y) with one row per embedded point.
data_table = pd.DataFrame(umap_coordinates[:, :2], columns=["x", "y"])
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='UMAP Embedding of Similarity Data',
  width=1000,
  height=800,
)
# Small markers keep the dense regions of the scatter readable.
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

## Investigate Clustering Performance

k-medoid clustering
hierarchical clustering
dbscan
optics
knn

https://scikit-learn.org/stable/modules/clustering.html#clustering-performance-evaluation

**Calinski-Harabasz Index**

- the calinski harabasz score does not seem to support arbitrary distance matrices
- the following runs, but probably is nonsense

```{python}
sklearn.metrics.calinski_harabasz_score(distances, output) # takes no metric argument, so metric="precomputed" is not available
```

**Davies-Bouldin Index**

Also does not appear to support arbitrary distance matrices. It also computes using the labels.
The scale is difficult to interpret.

```{python}
sklearn.metrics.davies_bouldin_score(distances, output) # metric="precomputed"
```

https://scikit-learn-extra.readthedocs.io/en/stable/modules/cluster.html
https://scikit-learn-extra.readthedocs.io/en/stable/auto_examples/cluster/plot_kmedoids_digits.html#sphx-glr-auto-examples-cluster-plot-kmedoids-digits-py
https://scikit-learn-extra.readthedocs.io/en/stable/auto_examples/cluster/plot_commonnn_data_sets.html#sphx-glr-auto-examples-cluster-plot-commonnn-data-sets-py
https://scikit-learn-extra.readthedocs.io/en/stable/auto_examples/cluster/plot_clustering.html#sphx-glr-auto-examples-cluster-plot-clustering-py



## Setting up default visual background for cluster vis

In [None]:
# Reference t-SNE layout reused as the background of every cluster plot below.
distances = msfeast._convert_similarity_to_distance(similarity_array)
# random_state is fixed so the background stays identical between reruns and
# cluster plots remain comparable. Other settings that were tried:
# early_exaggeration=12, learning_rate=100.
tsne_embedding = TSNE(
  n_components=2,
  metric="precomputed",
  init="random",
  perplexity=30,
  random_state=0,
)
tsne_coordinates = tsne_embedding.fit_transform(distances)

# Alternative Clustering Approaches


For small numbers of clusters, agglomerative clustering produces very small clusters alongside one very large cluster. Its results are thus utterly useless when seeking few clusters.

In [135]:
# Average-linkage agglomerative clustering on the precomputed distance matrix.
distances = msfeast._convert_similarity_to_distance(similarity_array)
hclust_setting = sklearn.cluster.AgglomerativeClustering(
  metric="precomputed",
  n_clusters=1000,
  linkage="average",
)
# One cluster label per row of the distance matrix.
output = hclust_setting.fit_predict(distances)

2.388323268840259

In [None]:
# Calinski-Harabasz score of the hierarchical clustering from the previous cell.
# CAVEAT: this score has no precomputed-distance mode, so each row of the
# distance matrix is treated as a feature vector; interpret the value with care.
sklearn.metrics.calinski_harabasz_score(distances, output)

In [None]:
# t-SNE background coordinates, one row per embedded point.
data_table = pd.DataFrame(tsne_coordinates[:, :2], columns=["x", "y"])
# Labels are stored as strings so plotly colours them as discrete categories.
data_table["h-clust"] = np.asarray(output).astype(str)
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color="h-clust",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='t-SNE Embedding of Similarity Data With Hier. Clusters',
  xaxis_title="t-sne embedding x coordinate",
  yaxis_title="t-sne embedding y coordinate",
  width=1000,
  height=800,
)
fig.update_traces(textposition='top center', marker={'size': 3})
fig.show()

**OPTICS**

OPTICS creates an undetermined number of clusters. The `min_samples` parameter defines how many neighbors a point must have in close proximity to be considered a cluster center; clusters are expanded from these points. Low values lead to more clusters. The minimum is two, leading to about 580 clusters. There is still a massive group of points labelled -1 (noise).

In [None]:
# OPTICS clustering on the precomputed distance matrix.
distances = msfeast._convert_similarity_to_distance(similarity_array)
# min_samples=2 is the lowest setting discussed in the notes for this section.
optics_clust = OPTICS(min_samples=2, metric="precomputed")
optics_out = optics_clust.fit_predict(distances)
# Number of distinct labels; this count includes the noise label -1 whenever
# any point is left unassigned.
np.unique(optics_out).size

In [None]:
# t-SNE background coordinates, one row per embedded point.
data_table = pd.DataFrame(tsne_coordinates[:, :2], columns=["x", "y"])
# Labels are stored as strings so plotly colours them as discrete categories.
data_table["clust"] = np.asarray(optics_out).astype(str)
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color="clust",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='t-SNE Embedding of Similarity Data With Optics Clusters',
  xaxis_title="t-sne embedding x coordinate",
  yaxis_title="t-sne embedding y coordinate",
  width=1000,
  height=800,
)
# Slight transparency helps where markers overlap.
fig.update_traces(textposition='top center', marker={'size': 3, 'opacity':0.7})
fig.show()

**HDBSCAN**

Leads to lower numbers of clusters even at minimal cluster sizes of 2. Large chunks of the clustering space will be left as noise (singleton).
HDBscan also seems to mess with the distances matrix... 
Seems completely broken

In [None]:
# NOTE: this experiment is intentionally disabled. The notes for this section
# report that HDBSCAN left large parts of the data as noise and appeared to
# modify the distance matrix. The code is kept only as a record of what was
# tried; nothing in this cell executes.
#distances = msfeast._convert_similarity_to_distance(similarity_array)
#hdbscan = sklearn.cluster.HDBSCAN(min_cluster_size=10, metric = "precomputed")
#hdbscan_out = hdbscan.fit_predict(distances)
#np.unique(hdbscan_out).size
#distances = msfeast._convert_similarity_to_distance(similarity_array)
#silhouette_score =  sklearn.metrics.silhouette_score(
#    X = distances, 
#    labels = hdbscan_out, 
#    metric= "precomputed"
#)
#print(silhouette_score)

**Spectral Clustering using adjacency matrix**

The impact of the cutoff seems to be great, as is the impact of the number of clusters. There is also interaction between these two settings.

Spectral clustering seems to agree very well with t-SNE.
However, like many other methods, spectral clustering tends to find some very large clusters of diffusely connected features.

Spectral clustering quickly becomes slower for larger n_clusters

In [None]:
# Spectral clustering on a thresholded similarity graph.
# random_state is fixed so that a rerun reproduces the same assignments.
sc = SpectralClustering(
  n_clusters=30,
  affinity='precomputed',
  assign_labels='discretize',
  random_state=0,
)
cutoff = 0.8
# Similarities below the cutoff are set to 0; the rest keep their value.
adjacency_matrix = np.where(similarity_array >= cutoff, similarity_array, 0)
assignments = sc.fit_predict(adjacency_matrix)
# Number of distinct cluster labels actually assigned.
np.unique(assignments).size

In [None]:
# Mean silhouette of the spectral assignments, evaluated on the precomputed
# distance matrix currently bound to `distances`.
silhouette_score = sklearn.metrics.silhouette_score(
  distances, assignments, metric="precomputed"
)
silhouette_score

In [None]:
# t-SNE background coordinates, one row per embedded point.
data_table = pd.DataFrame(tsne_coordinates[:, :2], columns=["x", "y"])
# Labels are stored as strings so plotly colours them as discrete categories.
data_table["clust"] = np.asarray(assignments).astype(str)
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color="clust",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='t-SNE Embedding of Similarity Data With Spectral Clusters',
  xaxis_title="t-sne embedding x coordinate",
  yaxis_title="t-sne embedding y coordinate",
  width=1000,
  height=800,
)
# Slight transparency helps where markers overlap.
fig.update_traces(textposition='top center', marker={'size': 3, 'opacity':0.7})
fig.show()

In [165]:
# Spectral clustering on the full (unthresholded) similarity matrix.
# random_state is fixed so that a rerun reproduces the same assignments.
sc = SpectralClustering(
  n_clusters=100,
  affinity='precomputed',
  assign_labels='discretize',
  random_state=0,
)
assignments = sc.fit_predict(similarity_array)
# Number of distinct cluster labels actually assigned.
np.unique(assignments).size

100

In [166]:
# Mean silhouette of the spectral assignments, evaluated on the precomputed
# distance matrix currently bound to `distances`.
silhouette_score = sklearn.metrics.silhouette_score(
  distances, assignments, metric="precomputed"
)
silhouette_score

0.2814662294490354

In [167]:
# Silhouette coefficient of every individual sample (one value per row of
# `distances`), shown as a histogram.
cluster_specific = sklearn.metrics.silhouette_samples(
  distances, assignments, metric="precomputed"
)
px.histogram(
  pd.Series(cluster_specific, name="silhouette").to_frame(),
  x="silhouette",
)

In [None]:
# t-SNE background coordinates, one row per embedded point.
data_table = pd.DataFrame(tsne_coordinates[:, :2], columns=["x", "y"])
# Labels are stored as strings so plotly colours them as discrete categories.
data_table["clust"] = np.asarray(assignments).astype(str)
fig = px.scatter(
  data_table,
  x="x",
  y="y",
  color="clust",
  color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
  title_text='t-SNE Embedding of Similarity Data With Spectral Clusters',
  xaxis_title="t-sne embedding x coordinate",
  yaxis_title="t-sne embedding y coordinate",
  width=1000,
  height=800,
)
# Slight transparency helps where markers overlap.
fig.update_traces(textposition='top center', marker={'size': 3, 'opacity':0.7})
fig.show()

In [None]:
from kmedoids import KMedoids
import sklearn.cluster

In [None]:
# Sweep over cluster counts, recording the silhouette score of average-linkage
# hierarchical clustering and of k-medoids for each count.
cutoff = 0.4
distances = msfeast._convert_similarity_to_distance(similarity_array)
# Distances at or below the cutoff are kept; everything else is replaced by 1.
# NOTE(review): this rebinds the notebook-wide `distances`; cells run after
# this one see the capped matrix.
distances = np.where(distances <= cutoff, distances, 1)

scores = []
for n_clust in range(5, 2500, 100):
  # Hierarchical clustering and its silhouette score.
  hclust_setting = sklearn.cluster.AgglomerativeClustering(
    n_clusters=n_clust,
    linkage="average",
    metric="precomputed",
  )
  output = hclust_setting.fit_predict(distances)
  score_hclust = sklearn.metrics.silhouette_score(
    distances, output, metric='precomputed'
  )

  # k-medoids with the same number of clusters and its silhouette score.
  cluster = KMedoids(
    n_clusters=n_clust,
    method="fasterpam",
    metric='precomputed',
    random_state=0,
  )
  cluster_assignments = cluster.fit_predict(distances)
  cluster_assignments_strings = [f"km_{elem}" for elem in cluster_assignments]
  score_kmedoid = sklearn.metrics.silhouette_score(
    distances, cluster_assignments_strings, metric="precomputed"
  )

  # Record layout: (cluster count, hierarchical score, k-medoids score).
  scores.append((n_clust, score_hclust, score_kmedoid))

In [None]:
# One row per tested cluster count. The tuples in `scores` are ordered
# (cluster count, hierarchical score, k-medoids score); the columns are named
# so that the plot axes are self-explanatory.
df = pd.DataFrame.from_records(
  scores,
  columns=["n_clusters", "silhouette_hclust", "silhouette_kmedoids"],
)
fig1 = px.scatter(
  df, x="n_clusters", y="silhouette_hclust",
  title="Silhouette score vs. number of clusters (hierarchical, average linkage)",
)
fig2 = px.scatter(
  df, x="n_clusters", y="silhouette_kmedoids",
  title="Silhouette score vs. number of clusters (k-medoids, FasterPAM)",
)
fig1.show()
fig2.show()