# Compare methods with pre-computed distances

* Affinity Propagation
* Agglomerative Clustering
* BIRCH
* DBSCAN
* K-Means
* Mini-Batch K-Means
* K-Medoids
* Mean Shift
* OPTICS
* Spectral Clustering
* Mixture of Gaussians

## Setup the packages

In [115]:
#%pip install seaborn

In [116]:
from sklearn.cluster import AffinityPropagation, DBSCAN, SpectralClustering, Birch, KMeans, AgglomerativeClustering
from sklearn.cluster import OPTICS, MiniBatchKMeans, estimate_bandwidth, MeanShift
from sklearn_extra.cluster import KMedoids
from IPython.display import display, Markdown
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.spatial.distance import mahalanobis
from itertools import product
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns

## Setup the data

In [117]:
# Number of simulated paths; it selects which test-data folder is read.
npaths = 4
path = "../TestData/paths%d/"%npaths

# Read the four tables in one pass; each uses its first column as the row index.
beta_df, se_df, pval_df, traits_df = (
    pd.read_csv(path + file_name, index_col=0)
    for file_name in (
        "unstdBeta_df.csv",
        "unstdSE_df.csv",
        "pval_df.csv",
        "trait_info_nfil.csv",
    )
)

data_df = dict(beta=beta_df, se=se_df, pval=pval_df)

# Keep only the first two columns of the beta table for the 2-D comparison.
col1, col2 = beta_df.columns[0], beta_df.columns[1]
beta_crop = beta_df.loc[:, [col1, col2]]
print(beta_crop)
print(col1, col2)

     Trait1     Trait2
1         1   2.086726
2         2   6.120570
3         3   6.886463
4         4   9.754806
5         5  12.277511
..      ...        ...
196      46  43.173467
197      47  44.186414
198      48  44.284154
199      49  46.801117
200      50  47.091551

[200 rows x 2 columns]
Trait1 Trait2


## Plotting function

In [118]:
def chartclusters(data,title,colorvar,tooltip, palette = None, clustgroups = None):
    """Build an interactive Altair scatter plot of the first two columns of ``data``.

    Parameters
    ----------
    data : table whose first column is plotted on x and second column on y.
    title : chart title.
    colorvar : Altair field shorthand used to colour the points.
    tooltip : fields shown in the hover tooltip.
    palette : optional list of colours; when given, it is used as the range of
        an explicit colour scale.
    clustgroups : domain of that colour scale (only used when ``palette`` is given).

    Returns
    -------
    The interactive chart.
    """
    x_field, y_field = data.columns[0], data.columns[1]

    # Without a palette the colour channel is the bare shorthand; with one, the
    # shorthand is wrapped in an explicit domain -> range scale.
    if palette is None:
        colour = colorvar
    else:
        colour = alt.Color(colorvar, scale=alt.Scale(domain=clustgroups, range=palette))

    points = alt.Chart(data, title=title).mark_circle(size=60)
    return points.encode(x=x_field, y=y_field, color=colour, tooltip=tooltip).interactive()

# Clustering

## Compute the cosine similarities

In [119]:
# Two-column view of the betas, plus a copy that collects every method's labels.
beta_crop = beta_df.loc[:, [col1, col2]]
results = beta_crop.copy()

# `similarities` holds row-by-row cosine similarity; `sims` is the matching
# cosine distance (1 - similarity), floored at zero.
similarities = cosine_similarity(beta_crop)
print('pairwise dense output:\n {}\n'.format(similarities))
sims = np.clip(1 - similarities, 0, None)

pairwise dense output:
 [[1.         0.99142441 0.99935064 ... 0.92912529 0.93538109 0.93288335]
 [0.99142441 1.         0.99548931 ... 0.87283604 0.88114527 0.87781464]
 [0.99935064 0.99548931 1.         ... 0.91519863 0.92203134 0.91929966]
 ...
 [0.92912529 0.87283604 0.91519863 ... 1.         0.99985045 0.99994699]
 [0.93538109 0.88114527 0.92203134 ... 0.99985045 1.         0.99997551]
 [0.93288335 0.87781464 0.91929966 ... 0.99994699 0.99997551 1.        ]]



## Cluster with Affinity Propagation

In [120]:
# With affinity="precomputed", AffinityPropagation interprets its input as
# similarities (larger = more alike). `sims` is the cosine *distance*, so the
# cosine similarity matrix is the correct input here.
aff_lab = "affinity"
affinity = AffinityPropagation(random_state=0, affinity="precomputed").fit(similarities)
results[aff_lab] = affinity.labels_

## Cluster with Agglomerative Clustering

In [121]:
# Average-linkage agglomerative clustering on the precomputed cosine distances.
agg_lab = 'Agglomerative'
AggClust = AgglomerativeClustering(linkage="average", metric="precomputed")
AggClust.fit(sims)
results[agg_lab] = AggClust.labels_

## Cluster with DBSCAN

In [122]:
# DBSCAN on the precomputed cosine distances.
eps = 0.001
# Note: %d truncates the float eps, so the column label comes out as 'dbscan0'.
dblab = 'dbscan%d'%eps
dbscan = DBSCAN(metric="precomputed", min_samples=2, eps=eps)
dbscan.fit(sims)
db_labs = dbscan.labels_
results[dblab] = db_labs

## Cluster with k-means


In [123]:
# K-Means with one cluster per path. KMeans has no precomputed-metric option,
# so each row of the distance matrix is used as that point's feature vector.
nclust = npaths
k_lab = 'kmeans%d'%nclust
kmeans = KMeans(n_clusters=nclust, n_init="auto", random_state=0)
kmeans.fit(sims)
results[k_lab] = kmeans.labels_

## Cluster with mini-Batch K-means

In [124]:
# Mini-batch K-Means on the rows of the cosine-distance matrix.
km_lab = 'kmeans_minibatch%d'%nclust
kmeans_batch = MiniBatchKMeans(
    n_clusters=nclust,
    batch_size=30,
    max_iter=10,
    n_init="auto",
    random_state=0,
)
kmeans_batch.fit(sims)
results[km_lab] = kmeans_batch.labels_

## Cluster with k-medoids

In [125]:
# K-Medoids on the precomputed cosine distances, one cluster per path.
nclust = npaths
kmed_lab = 'kmedoids%d'%nclust
kmedoids = KMedoids(n_clusters=nclust, random_state=0, metric="precomputed")
kmedoids.fit(sims)
results[kmed_lab] = kmedoids.labels_

## Cluster with spectral

In [126]:
# Spectral clustering takes an affinity (similarity) matrix, so it is fitted on
# the cosine similarities rather than on the distances.
sp_lab = 'spectral'
spectral = SpectralClustering(
    affinity="precomputed",
    n_clusters=5,
    eigen_solver="arpack",
    assign_labels='discretize',
)
spectral.fit(similarities)
results[sp_lab] = spectral.labels_

## Cluster with OPTICS

In [127]:
# With metric="precomputed", OPTICS interprets its input as pairwise distances.
# Passing the cosine *similarities* would give every point a distance of 1 to
# itself and treat the most alike points as the farthest apart, so the cosine
# distance matrix `sims` is used instead.
op_lab = 'optics'
op_clust = OPTICS(min_samples=5, metric="precomputed").fit(sims)
op_labs = op_clust.labels_
results[op_lab] = op_labs

## Cluster with Gaussian Mixture Model

In [128]:
# Gaussian mixture with diagonal covariances, fitted on the rows of the
# cosine-distance matrix; labels are the most likely component for each row.
gm_lab = "mixture"
gmm = GaussianMixture(
    n_components=5,
    covariance_type="diag",
    init_params="random_from_data",
    random_state=0,
)
gmm.fit(sims)
results[gm_lab] = gmm.predict(sims)

### Plot the clustering methods with predefined distances

In [129]:
# Label columns shown in the tooltips and used to build the shared colour domain.
# The Gaussian-mixture column is included because its chart uses this palette too.
tooltip1 = [aff_lab, agg_lab, dblab, kmed_lab, km_lab, k_lab, sp_lab, op_lab, gm_lab]
# Create a palette to ensure the junk and noise clusters are grey.
clust_groups = pd.unique(results[tooltip1].values.ravel('K'))
ncolours = len(clust_groups)
palette = sns.color_palette(None, ncolours).as_hex()
junk_ind = np.where(clust_groups == -1)
# Only recolour when some method actually produced the noise label -1;
# indexing an empty match would raise IndexError.
if junk_ind[0].size:
    palette[junk_ind[0][0]] = '#808080'


In [130]:

# One chart per method, all sharing the same tooltip fields and colour scale.
# Each label column is passed as a nominal (":N") colour field.
affini, agglom, dbscan, kmedoi, kmean, kminb, spectr, gauss, optics = (
    chartclusters(results, chart_title, label_col + ":N", tooltip1,
                  palette=palette, clustgroups=clust_groups)
    for chart_title, label_col in (
        ("Affinity Propagation", aff_lab),
        ("Agglomerative Clustering", agg_lab),
        ("DBSCAN", dblab),
        ("K-Medoids", kmed_lab),
        ("K-means", k_lab),
        ("K-means mini-batch", km_lab),
        ("Spectral", sp_lab),
        ("Gaussian Mixture Model", gm_lab),
        ("OPTICS", op_lab),
    )
)


In [131]:
# Display charts: Altair's `|` places two charts side by side and `&` stacks
# the resulting rows vertically, giving a two-column grid with OPTICS last.
(affini | agglom) & (dbscan | kmedoi) &( kmean | kminb) & (spectr | gauss) & optics

## Repeat with messier data

So far, all of the test data passes through the origin. Let's repeat the tests on data with different intercepts.

In [132]:
# Same layout as the first data set, read from the moved-intercept folder.
npaths = 4
path = "../TestData_moveintercept/paths%d/"%npaths

# Read the four tables in one pass; each uses its first column as the row index.
beta_df_new, se_df_new, pval_df_new, traits_df_new = (
    pd.read_csv(path + file_name, index_col=0)
    for file_name in (
        "unstdBeta_df.csv",
        "unstdSE_df.csv",
        "pval_df.csv",
        "trait_info_nfil.csv",
    )
)

# Rebinds data_df, col1 and col2 to the new data set.
data_df = dict(beta=beta_df_new, se=se_df_new, pval=pval_df_new)
col1, col2 = beta_df_new.columns[0], beta_df_new.columns[1]
beta_crop_new = beta_df_new.loc[:, [col1, col2]]

### Compute cosine-similarities

In [133]:
# Copy that collects every method's labels for the moved-intercept data.
results_new = beta_crop_new.copy()

# Cosine similarity between rows, and the matching distance floored at zero.
similarities_new = cosine_similarity(beta_crop_new)
print('pairwise dense output:\n {}\n'.format(similarities_new))
sims_new = np.clip(1 - similarities_new, 0, None)

pairwise dense output:
 [[ 1.         -0.92853614  0.38442312 ...  0.70821861  0.56472748
   0.67074752]
 [-0.92853614  1.         -0.01423609 ... -0.39551224 -0.21799221
  -0.3474684 ]
 [ 0.38442312 -0.01423609  1.         ...  0.92399818  0.97895496
   0.94254326]
 ...
 [ 0.70821861 -0.39551224  0.92399818 ...  1.          0.98259078
   0.99866098]
 [ 0.56472748 -0.21799221  0.97895496 ...  0.98259078  1.
   0.99088609]
 [ 0.67074752 -0.3474684   0.94254326 ...  0.99866098  0.99088609
   1.        ]]



## Cluster with affinity propagation

In [134]:
# With affinity="precomputed", AffinityPropagation interprets its input as
# similarities (larger = more alike). `sims_new` is the cosine *distance*, so
# the cosine similarity matrix is the correct input here.
aff_lab = "affinity"
affinity = AffinityPropagation(random_state=0, affinity="precomputed").fit(similarities_new)
results_new[aff_lab] = affinity.labels_

## Cluster with agglomerative clustering

In [135]:
# Average-linkage agglomerative clustering on the precomputed cosine distances.
agg_lab = 'Agglomerative'
AggClust = AgglomerativeClustering(linkage="average", metric="precomputed")
AggClust.fit(sims_new)
results_new[agg_lab] = AggClust.labels_

### Cluster with K-medoids

In [136]:
# K-Medoids on the precomputed cosine distances, one cluster per path.
nclust = npaths
kmed_lab = 'kmedoids%d'%nclust
kmedoids = KMedoids(n_clusters=nclust, random_state=0, metric="precomputed")
kmedoids.fit(sims_new)
results_new[kmed_lab] = kmedoids.labels_

## Cluster with DBSCAN

In [137]:
# DBSCAN on the precomputed cosine distances of the moved-intercept data.
eps = 0.001
# Note: %d truncates the float eps, so the column label comes out as 'dbscan0'.
dblab = 'dbscan%d'%eps
dbscan = DBSCAN(metric="precomputed", min_samples=2, eps=eps)
dbscan.fit(sims_new)
db_labs = dbscan.labels_
results_new[dblab] = db_labs

## Cluster with k-means

In [138]:
# K-Means with one cluster per path; rows of the distance matrix act as features.
nclust = npaths
k_lab = 'kmeans%d'%nclust
kmeans = KMeans(n_clusters=nclust, n_init="auto", random_state=0)
kmeans.fit(sims_new)
results_new[k_lab] = kmeans.labels_

## Cluster with mini-batch k-means

In [139]:
# Mini-batch K-Means on the rows of the cosine-distance matrix.
km_lab = 'kmeans_minibatch%d'%nclust
kmeans_batch = MiniBatchKMeans(
    n_clusters=nclust,
    batch_size=30,
    max_iter=10,
    n_init="auto",
    random_state=0,
)
kmeans_batch.fit(sims_new)
results_new[km_lab] = kmeans_batch.labels_

## Cluster with Spectral

In [140]:
# Spectral clustering on the absolute cosine similarities. The printed matrix
# contains negative entries; taking the absolute value makes them non-negative.
# NOTE(review): this also makes opposite-direction points count as highly
# similar -- confirm that is intended.
sp_lab = 'spectral'
spectral = SpectralClustering(
    affinity="precomputed",
    n_clusters=5,
    eigen_solver="arpack",
    assign_labels='discretize',
)
spectral.fit(np.abs(similarities_new))
results_new[sp_lab] = spectral.labels_

## Cluster with OPTICS

In [141]:
# With metric="precomputed", OPTICS interprets its input as pairwise distances.
# abs(similarities_new) is a similarity (1 on the diagonal, large for alike
# points), so the non-negative cosine distance matrix `sims_new` is used instead.
op_lab = 'optics'
op_clust = OPTICS(min_samples=2, metric="precomputed").fit(sims_new)
op_labs = op_clust.labels_
results_new[op_lab] = op_labs

## Cluster with Gaussian Mixture Model

In [142]:
# Gaussian mixture with diagonal covariances on the rows of the distance matrix.
gm_lab = "mixture"
gmm = GaussianMixture(
    n_components=5,
    covariance_type="diag",
    init_params="random_from_data",
    random_state=0,
)
gmm.fit(sims_new)
results_new[gm_lab] = gmm.predict(sims_new)

### Plot

In [143]:
# Label columns shown in the tooltips and used to build the shared colour
# domain (the spectral column is deliberately left out here).
tooltip1 =[aff_lab, agg_lab, dblab, kmed_lab, k_lab, km_lab, #sp_lab, 
           op_lab, gm_lab]
# Create a palette to ensure the junk and noise clusters are grey.
clust_groups = pd.unique(results_new[tooltip1].values.ravel('K'))
ncolours = len(clust_groups)
palette = sns.color_palette(None, ncolours).as_hex()
junk_ind = np.where(clust_groups == -1)
# Only recolour when some method actually produced the noise label -1;
# indexing an empty match would raise IndexError.
if junk_ind[0].size:
    palette[junk_ind[0][0]] = '#808080'

In [144]:
# One chart per method for the moved-intercept data, all sharing the same
# tooltip fields and colour scale.
(affini_new, agglom_new, dbscan_new, kmedoi_new, kmean_new,
 kminb_new, spectr_new, optics_new, gauss_new) = (
    chartclusters(results_new, chart_title, label_col + ":N", tooltip1,
                  palette=palette, clustgroups=clust_groups)
    for chart_title, label_col in (
        ("Affinity Propagation", aff_lab),
        ("Agglomerative Clustering", agg_lab),
        ("DBSCAN", dblab),
        ("K-Medoids", kmed_lab),
        ("K-means", k_lab),
        ("K-means mini-batch", km_lab),
        ("Spectral", sp_lab),
        ("OPTICS", op_lab),
        ("Gaussian Mixture Model", gm_lab),
    )
)

In [145]:
# Show the moved-intercept charts as a two-column grid (`|` = side by side,
# `&` = stacked), with the Gaussian mixture chart on its own last row.
(affini_new | agglom_new) & (dbscan_new | kmedoi_new) & (kmean_new | kminb_new) & (spectr_new | optics_new) & gauss_new

# Compare with a different metric: Mahalanobis distance

Details on the Mahalanobis distance can be found here: https://www.statisticshowto.com/mahalanobis-distance/. The metric compares points in a multivariate space.

If the dataset has covariance matrix $C$, then the Mahalanobis distance between the points $x_A$ and $x_B$ is $$ d(x_B, x_A) = \sqrt{(x_B - x_A)^T C^{-1} (x_B - x_A)} $$

## Compute Mahalanobis distance

In [146]:
def mahalanobis_matrix(x=None):
    """Return the square matrix of pairwise Mahalanobis distances between rows of ``x``.

    Parameters
    ----------
    x : DataFrame or 2-D array-like
        One observation per row, one variable per column. Rows are addressed by
        position, so any index labels (not only 1..n) are supported.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows, n_rows)`` whose ``[i, j]`` entry is the
        Mahalanobis distance between row ``i`` and row ``j``.

    Raises
    ------
    ValueError
        If ``x`` is None.
    numpy.linalg.LinAlgError
        If the covariance matrix of the columns is singular.
    """
    if x is None:
        raise ValueError("mahalanobis_matrix requires a table of observations")

    # Work on a plain float array: positional access is independent of the
    # index labels and avoids a label lookup for every pair of rows.
    values = np.asarray(x, dtype=float)

    # scipy's mahalanobis() expects the *inverse* of the covariance matrix.
    inv_cov = np.linalg.inv(np.cov(values, rowvar=False))

    n_rows = values.shape[0]
    maha_arr = np.zeros((n_rows, n_rows))
    # For each pair of rows compute the Mahalanobis distance.
    for i, j in product(range(n_rows), repeat=2):
        maha_arr[i, j] = mahalanobis(values[i], values[j], inv_cov)
    return maha_arr

In [147]:
# Copy that collects the labels of every method run on Mahalanobis distances.
results_new_maha = beta_crop_new.copy()

# Pairwise Mahalanobis distances between the rows of the moved-intercept data.
sim_maha = mahalanobis_matrix(beta_crop_new)
print('pairwise dense output:\n {}\n'.format(sim_maha))

pairwise dense output:
 [[0.         0.38806569 0.18379352 ... 3.30737716 3.33486682 3.43297146]
 [0.38806569 0.         0.26975322 ... 3.33371912 3.327163   3.4489545 ]
 [0.18379352 0.26975322 0.         ... 3.19756856 3.21235022 3.31937251]
 ...
 [3.30737716 3.33371912 3.19756856 ... 0.         0.29204939 0.15070936]
 [3.33486682 3.327163   3.21235022 ... 0.29204939 0.         0.23417183]
 [3.43297146 3.4489545  3.31937251 ... 0.15070936 0.23417183 0.        ]]



## Cluster with Affinity Propagation

In [None]:
# This section clusters on Mahalanobis distances, so the model must be fitted on
# `sim_maha` and its labels stored in `results_new_maha` (not the cosine-based
# `sims` / `results` of the first section).
# affinity="precomputed" expects similarities, so the distances are negated:
# the closest pairs then carry the largest similarity.
aff_lab = "affinity"
affinity = AffinityPropagation(random_state=0, affinity="precomputed").fit(-sim_maha)
results_new_maha[aff_lab] = affinity.labels_

## Cluster with Agglomerative Clustering

In [None]:
# Average-linkage agglomerative clustering on the Mahalanobis distances.
agg_lab = 'Agglomerative'
AggClust = AgglomerativeClustering(linkage="average", metric="precomputed")
AggClust.fit(sim_maha)
results_new_maha[agg_lab] = AggClust.labels_

## Cluster with DBSCAN

In [149]:
# DBSCAN on the Mahalanobis distances. `eps` is whatever the earlier DBSCAN
# cell left in scope.
# NOTE(review): the printed Mahalanobis distances are far larger than the
# cosine distances that eps was chosen for -- confirm the radius is still suitable.
dblab = 'dbscan%d'%eps
dbscan = DBSCAN(metric="precomputed", min_samples=2, eps=eps)
dbscan.fit(sim_maha)
db_labs = dbscan.labels_
results_new_maha[dblab] = db_labs

## Cluster with k-means

In [None]:
# K-Means with one cluster per path; rows of the Mahalanobis matrix act as features.
nclust = npaths
k_lab = 'kmeans%d'%nclust
kmeans = KMeans(n_clusters=nclust, n_init="auto", random_state=0)
kmeans.fit(sim_maha)
results_new_maha[k_lab] = kmeans.labels_

## Cluster with k-means mini-batch

In [None]:
# Mini-batch K-Means on the rows of the Mahalanobis distance matrix.
km_lab = 'kmeans_minibatch%d'%nclust
kmeans_batch = MiniBatchKMeans(
    n_clusters=nclust,
    batch_size=30,
    max_iter=10,
    n_init="auto",
    random_state=0,
)
kmeans_batch.fit(sim_maha)
results_new_maha[km_lab] = kmeans_batch.labels_

## Cluster with K-medoids

In [148]:
# K-Medoids on the precomputed Mahalanobis distances, one cluster per path.
nclust = npaths
kmed_lab = 'kmedoids%d'%nclust
kmedoids = KMedoids(n_clusters=nclust, random_state=0, metric="precomputed")
kmedoids.fit(sim_maha)
results_new_maha[kmed_lab] = kmedoids.labels_

## Cluster with Optics

In [None]:
# OPTICS on the precomputed Mahalanobis distances.
op_lab = 'optics'
op_clust = OPTICS(metric="precomputed", min_samples=5)
op_clust.fit(sim_maha)
op_labs = op_clust.labels_
results_new_maha[op_lab] = op_labs

## Cluster with spectral

In [None]:
# SpectralClustering with affinity="precomputed" expects similarities (larger =
# more alike), whereas `sim_maha` holds distances (0 = identical). The distances
# are mapped to affinities with a Gaussian kernel, exp(-d^2 / (2 * delta^2)).
# The kernel width delta is a free parameter; 1.0 is used because Mahalanobis
# distances are already scaled by the data covariance.
maha_delta = 1.0
maha_affinity = np.exp(-sim_maha ** 2 / (2.0 * maha_delta ** 2))
sp_lab = 'spectral'
spectral = SpectralClustering(n_clusters=5,
                              assign_labels='discretize',
                              eigen_solver="arpack",
                              affinity="precomputed").fit(maha_affinity)
results_new_maha[sp_lab] = spectral.labels_

## Cluster with Gaussian Mixture Model

In [150]:
# Gaussian mixture with diagonal covariances on the rows of the Mahalanobis matrix.
gm_lab = "mixture"
gmm = GaussianMixture(
    n_components=5,
    covariance_type="diag",
    init_params="random_from_data",
    random_state=0,
)
gmm.fit(sim_maha)
results_new_maha[gm_lab] = gmm.predict(sim_maha)

## Plot the clusters with the new distance metric

In [151]:
# Label columns for the Mahalanobis charts. Only columns actually present in
# `results_new_maha` are kept, so the lookup below cannot fail on a label that
# was never stored there.
tooltip1 = [lab for lab in (dblab, kmed_lab, gm_lab, aff_lab, agg_lab)
            if lab in results_new_maha.columns]
# Create a palette to ensure the junk and noise clusters are grey.
# The domain must come from the Mahalanobis results (the table being plotted),
# not from the cosine-distance `results` table.
clust_groups = pd.unique(results_new_maha[tooltip1].values.ravel('K'))
ncolours = len(clust_groups)
palette = sns.color_palette(None, ncolours).as_hex()
junk_ind = np.where(clust_groups == -1)
# Only recolour when some method actually produced the noise label -1;
# indexing an empty match would raise IndexError.
if junk_ind[0].size:
    palette[junk_ind[0][0]] = '#808080'


In [None]:
# One chart per method for the Mahalanobis results, all sharing the same tooltip
# fields and colour scale. The Agglomerative chart is coloured by its own label
# column (agg_lab); it previously reused the Gaussian-mixture column.
(dbscan_maha, kmedoi_maha, gauss_maha, affini_maha, agglom_maha,
 kmeans_maha, kminib_maha, optics_maha, specte_maha) = (
    chartclusters(results_new_maha, chart_title, label_col + ":N", tooltip1,
                  palette=palette, clustgroups=clust_groups)
    for chart_title, label_col in (
        ("DBSCAN maha", dblab),
        ("K-Medoids maha", kmed_lab),
        ("Gaussian Mixture Model", gm_lab),
        ("Affinity Propagation", aff_lab),
        ("Agglomerative", agg_lab),
        ("K-means", k_lab),
        ("Mini-batch", km_lab),
        ("Optics", op_lab),
        ("Specteral", sp_lab),
    )
)



In [152]:
# Show the Mahalanobis-based charts: two rows of side-by-side pairs, then the
# Gaussian mixture chart on its own row.
(affini_maha | agglom_maha) & (dbscan_maha | kmedoi_maha) & gauss_maha

## Plot the same clustering methods with different metrics

In [153]:
# Side-by-side comparison per method: Mahalanobis distance on the left,
# cosine distance on the right (K-Medoids on top, DBSCAN below).
(kmedoi_maha | kmedoi_new) & (dbscan_maha | dbscan_new)

# Methods which don't accept a precomputed metric

## Cluster with Birch

In [154]:
# Birch, fitted on the rows of the cosine-distance matrix; n_clusters=None
# skips the final global clustering step.
b_lab = 'birch'
brc = Birch(threshold=0.01, branching_factor=50, n_clusters=None)
brc.fit(sims)
results[b_lab] = brc.predict(sims)

## Cluster with Mean-Shift

In [155]:
# The bandwidth must be estimated on the same matrix that MeanShift is fitted
# on. It was previously estimated on the raw betas (`beta_crop`) while the model
# was fitted on the cosine-distance rows (`sims`), so the kernel width was on a
# different scale from the data being clustered.
bandwidth = estimate_bandwidth(sims, quantile=0.2, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(sims)
ms_lab = "Mean-Shift"
results[ms_lab] = ms.labels_

### Plot the clustering methods without predefined distances.

In [156]:
# Charts for the two methods without a precomputed-metric option; no palette is
# passed, so Altair picks the colour scheme.
tooltip = [b_lab, ms_lab]
birch, means = (
    chartclusters(results, chart_title, label_col + ":N", tooltip)
    for chart_title, label_col in (("Birch", b_lab), ("Mean-shift", ms_lab))
)

In [157]:
# Show the Birch and Mean-Shift charts side by side.
(birch | means)

In [158]:
# Show the Birch chart on its own.
birch