# ECM UMAP and clustering
This notebook performs UMAP and clustering on the PH features (dim-0 and dim-1 PH combined) of ECM. This notebook uses Python.

### Data
* 401 ROIs.
* We only include 396 ROIs whose TDA features were nontrivial.

### Features
* For each ECM image, we generate two TDA features, namely, PD0 and PD1.
    * PD0: dimension-0 persistence diagram.  
    * PD1: dimension-1 persistence diagram. 
* PD0 and PD1 are converted to PI0 and PI1.
    * PI0: dimension-0 persistence image. Vector of length 20
    * PI1: dimension-1 persistence image. Array of size 20 x 20.
* We flatten PI1 into a vector of length 400 and concatenate with PI0. This results in a vector of length 420.
* The features array (of size 396, 420) is saved in: `analysis/ECM/combined_UMAP_clusters/ECM_topological_features.csv`
* The ROI that corresponds to each index is saved in `analysis/ECM/combined_UMAP_clusters/ECM_PI01_idx_ROI.csv`

In [None]:
import h5py
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import mplcursors
import seaborn as sns
import umap
import hdbscan

In [None]:
%matplotlib widget

# 1. load data

In [None]:
# UMAP coordinates stored on disk
umap_coord = np.genfromtxt(
    "analysis/ECM/combined_UMAP_clusters/ECM_dim01_umap.csv",
    delimiter=",",
)

# index -> ROI lookup, read as strings
idx_ROI = np.loadtxt(
    "analysis/ECM/combined_UMAP_clusters/ECM_PI01_idx_ROI.csv",
    delimiter=",",
    dtype=str,
)

# topological feature matrix (one row per ROI)
features_array = np.loadtxt(
    "analysis/ECM/combined_UMAP_clusters/ECM_topological_features.csv",
    delimiter=",",
)
print("shape of features array: ", features_array.shape)

# table holding all of the data
df = pd.read_csv(
    "analysis/ECM/combined_UMAP_clusters/ECM_data.csv",
    index_col=None,
)

### generating `features_array` ###
# load raw data
# PD0 = dict()
# PD1 = dict()
# PI0 = dict()
# PI1 = dict()

# PD_dir = "TDA_features/persistence_diagrams/ECM/"
# PI_dir = "TDA_features/persistence_images/ECM/"

# for ROI in idx_ROI:
#     pd0 = np.genfromtxt(PD_dir + "PD0/" + ROI + ".csv", delimiter = ",")
#     pd1 = np.genfromtxt(PD_dir + "PD1/" + ROI + ".csv", delimiter = ",")    
#     pi0 = np.genfromtxt(PI_dir + "PI0/" + ROI + ".csv", delimiter = ",")
#     pi1 = np.genfromtxt(PI_dir + "PI1/" + ROI + ".csv", delimiter = ",")    
    
#     PD0[ROI] = pd0
#     PD1[ROI] = pd1
#     PI0[ROI] = pi0
#     PI1[ROI] = pi1

# prepare features array
# features = {ROI:np.concatenate([PI0[ROI], PI1[ROI].T.flatten()]) for ROI in idx_ROI}
# n_ROI = len(idx_ROI)
# n_features = len(features[idx_ROI[0]])
# features_array = np.empty((n_ROI, n_features))

# for i in range(n_ROI):
#     features_array[i,:] = features[idx_ROI[i]]
# np.savetxt("analysis/ECM/combined_UMAP_clusters/ECM_topological_features.csv", features_array, delimiter=",")

### generating the dataframe ###
# load cluster info from Iris's analysis
#cluster_indices = dict()
#with h5py.File("UMAP/ECM_cluster_indices.h5", "r") as f:
#    for k in f.keys():
#        cluster_indices[int(k)] = f[k][()]

# Above dataframe was prepared by:
# columns = ["umap1", "umap2", "cluster", "ROI"]
# df = pd.DataFrame(columns = columns)
# for i in range(n_ROI):
#     cluster = find_cluster(i, cluster_indices)
#     df = pd.concat([df, pd.DataFrame([[umap_coord[0,i], umap_coord[1,i], cluster, idx_ROI[i]]], columns = columns)])
# df.reset_index(inplace = True, drop = True)
# df.to_csv('analysis/ECM/combined_UMAP_clusters/ECM_data.csv', index=False)

In [None]:
# Mean-centre the features: subtract each column's mean (taken over all rows).
column_means = features_array.mean(axis=0)
features_array_mean = features_array - column_means

In [None]:
# To find the ROI corresponding to a specific index, look it up in idx_ROI.
# Change `idx` to inspect another entry; the lookup uses the variable, so the
# printed index and the printed ROI always agree.
idx = 10
print("ROI corresponding to index ", idx, ": ", idx_ROI[idx])

# 2. UMAP

In [None]:
### build `features_array` from the per-ROI persistence files ###
PD_dir = "PH_features/persistence_diagrams/ECM/"
PI_dir = "PH_features/persistence_images/ECM/"

# raw persistence diagrams / images, keyed by ROI name
PD0, PD1, PI0, PI1 = {}, {}, {}, {}
for ROI in idx_ROI:
    csv_name = ROI + ".csv"
    PD0[ROI] = np.genfromtxt(PD_dir + "PD0/" + csv_name, delimiter=",")
    PD1[ROI] = np.genfromtxt(PD_dir + "PD1/" + csv_name, delimiter=",")
    PI0[ROI] = np.genfromtxt(PI_dir + "PI0/" + csv_name, delimiter=",")
    PI1[ROI] = np.genfromtxt(PI_dir + "PI1/" + csv_name, delimiter=",")

# one feature vector per ROI: PI0 followed by the transposed, flattened PI1
features = {}
for ROI in idx_ROI:
    features[ROI] = np.concatenate([PI0[ROI], PI1[ROI].T.flatten()])

n_ROI = len(idx_ROI)
n_features = len(features[idx_ROI[0]])

# stack the vectors row by row, in the order given by idx_ROI
features_array = np.empty((n_ROI, n_features))
for row, ROI in enumerate(idx_ROI):
    features_array[row, :] = features[ROI]

print(features_array.shape)

# remove the per-feature (column) mean
features_array_mean = features_array - features_array.mean(axis=0)

In [None]:
# 2-component UMAP of the mean-centred features, used for the figures.
# random_state is fixed so that repeated runs use the same seed.
presentable_umap = umap.UMAP(n_neighbors=5, n_components=2, random_state=5)
presentable_embedding = presentable_umap.fit_transform(features_array_mean)

In [None]:
# Scatter plot of the presentable embedding plus a (initially hidden)
# annotation box that the hover callback fills in.
x = presentable_embedding[:, 0]
y = presentable_embedding[:, 1]
names = idx_ROI

# Colour normalisation / colormap for a colour-coded version of this plot.
# They only take effect once per-point values are passed to scatter via `c`;
# without `c` matplotlib ignores `norm` and emits a warning, so it is not
# passed below.
norm = plt.Normalize(1, 4)
cmap = plt.cm.RdYlGn

fig, ax = plt.subplots()
# Draw on the explicit axes object rather than through the pyplot state machine.
sc = ax.scatter(x, y, s=10)

# Annotation is created once and toggled visible/invisible by the hover handler.
annot = ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points",
                    bbox=dict(boxstyle="round", fc="w"),
                    arrowprops=dict(arrowstyle="->"))
annot.set_visible(False)

def update_annot(ind):
    """Move the global annotation `annot` onto the hovered point(s) of `sc`.

    `ind` is the details dict whose "ind" entry lists the indices of the
    points under the cursor. The annotation is anchored at the first of
    them and its text is all of the indices separated by single spaces.
    """
    hit_indices = ind["ind"]
    annot.xy = sc.get_offsets()[hit_indices[0]]
    label = " ".join(str(k) for k in hit_indices)
    annot.set_text(label)
    # semi-transparent box
    annot.get_bbox_patch().set_alpha(0.4)
    

def hover(event):
    """Mouse-move handler: show the annotation over a point, hide it otherwise.

    Uses the globals `ax`, `sc`, `annot` and `fig`. Events outside `ax` are
    ignored. When the cursor is on a point of `sc` the annotation is updated
    and shown; when it is not, a currently visible annotation is hidden.
    """
    was_visible = annot.get_visible()
    if event.inaxes != ax:
        return

    on_point, details = sc.contains(event)
    if on_point:
        update_annot(details)
        annot.set_visible(True)
        fig.canvas.draw_idle()
    elif was_visible:
        annot.set_visible(False)
        fig.canvas.draw_idle()

# Route mouse-move events on this figure's canvas to `hover`; the returned
# connection id is kept so the callback can be disconnected later if needed.
hover_cid = fig.canvas.mpl_connect("motion_notify_event", hover)

plt.show()

# 3. Clustering

In [None]:
# UMAP for clustering
clusterable_embedding = umap.UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.00001,
    random_state=5,
).fit_transform(features_array_mean)

# HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=8,min_samples=2,cluster_selection_epsilon=0.5)
cluster_labels = clusterer.fit_predict(clusterable_embedding)

In [None]:
# Relabeling for plotting purposes: maps each raw HDBSCAN label to the label
# used in the figures.
relabel = {-1 : -1,    # -1 is unassigned cluster. In the paper, we label it "cluster 9"
            1 : 0,
            3 : 1,
            7 : 2,
            6 : 3,
            0 :4,
            5 :5,
            4: 6,
            2: 7}

# The mapping above is hard-coded for one particular clustering result. If the
# clustering returns a label that is not listed, fail with a message that says
# which labels are missing instead of a bare KeyError from the lookup.
unmapped_labels = sorted({int(label) for label in cluster_labels}.difference(relabel))
if unmapped_labels:
    raise KeyError(
        "cluster_labels contains labels with no entry in `relabel`: "
        f"{unmapped_labels}; update the `relabel` mapping for this clustering result"
    )

cluster_labels2 = [relabel[label] for label in cluster_labels]

In [None]:
# Nine hex colours, listed in colormap order.
cluster_hex_colors = [
    "#A9A9A9",
    "#780000",
    "#cb334c",
    "#f89981",
    "#ffbd00",
    "#02c39a",
    "#429bb4",
    "#7851A9",
    "#32174D",
]
colormaps = matplotlib.colors.ListedColormap(cluster_hex_colors)
# last expression of the cell, so the notebook displays the colormap
colormaps

In [None]:
# Table of embedding coordinates, relabeled cluster and ROI name, one row per
# ROI. Built in a single call instead of concatenating one-row frames in a
# loop, which is quadratic and starts from an empty, untyped frame.
columns = ["umap1", "umap2", "cluster", "ROI"]
df = pd.DataFrame(
    {
        "umap1": presentable_embedding[:, 0],
        "umap2": presentable_embedding[:, 1],
        "cluster": cluster_labels2,
        "ROI": idx_ROI,
    },
    columns=columns,
)

fig, ax = plt.subplots()
g = sns.scatterplot(x="umap1", y="umap2",
                    hue="cluster",
                    palette=colormaps,
                    legend="full",
                    data=df,
                    ax=ax)

# `hover` reads `sc`, `annot`, `ax` and `fig` as globals. Point `sc` at the
# scatter drawn on THIS axes; otherwise it still refers to the collection of
# the previous figure and hovering here never matches a point.
sc = ax.collections[0]

annot = ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points",
                    bbox=dict(boxstyle="round", fc="w"),
                    arrowprops=dict(arrowstyle="->"))
annot.set_visible(False)
ax.legend(loc = "upper right")
fig.canvas.mpl_connect("motion_notify_event", hover)
plt.show()

Example ROI indices from each cluster. We present these ROIs in the manuscript.
* cluster -1 (unassigned): 314, 146, 116, 278
* cluster 0 : 163, 37, 251, 296
* cluster 1: 58, 189, 164, 62
* cluster 2: 23, 208, 222, 158
* cluster 3: 71, 173, 225, 19
* cluster 4: 6, 345, 328, 104
* cluster 5: 75, 211, 114, 41
* cluster 6: 125, 40, 176, 194
* cluster 7: 250, 52, 213, 56




In [None]:
# save embedding coordinates and cluster indices
# (uncomment to write the two CSV files that section 4 reads back)
# np.savetxt("analysis/ECM/combined_UMAP_clusters/presentable_embedding.csv", presentable_embedding, delimiter=",");
# np.savetxt("analysis/ECM/combined_UMAP_clusters/cluster_labels_python.csv", cluster_labels2, delimiter =", ")
 

# 4. Save one file with presentable embedding, ROI, corresponding cluster

In [None]:
# Reload the saved embedding, ROI names and cluster labels from disk and
# combine them into one table.

# load umap coordinates
umap_coord = np.genfromtxt("analysis/ECM/combined_UMAP_clusters/presentable_embedding.csv", delimiter = ",")

# load index_to_ROI
idx_ROI = np.loadtxt("analysis/ECM/combined_UMAP_clusters/ECM_PI01_idx_ROI.csv", delimiter = ",", dtype = str)

# cluster labels
# np.loadtxt parses values as float by default; the labels are integer cluster
# ids, so cast them back to int to keep the "clusters" column integral
# (e.g. -1 rather than -1.0).
cluster_labels = np.loadtxt("analysis/ECM/combined_UMAP_clusters/cluster_labels_python.csv", delimiter = ",").astype(int)

d = {"coord1": umap_coord[:,0], "coord2": umap_coord[:,1], "ROI": idx_ROI, "clusters": cluster_labels}
df = pd.DataFrame(data = d)

#df.to_csv("analysis/ECM/combined_UMAP_clusters/UMAP/coords_ROI_clusters.csv", index = False)