# Testing Clustering Methods

In [1]:
#%pip install scikit-learn
#%pip install scikit-learn-extra
#%pip install altair

In [2]:
from sklearn.cluster import DBSCAN, SpectralClustering, Birch, KMeans
from sklearn_extra.cluster import KMedoids
from IPython.display import display, Markdown
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
import numpy as np
import altair as alt

## Setup test data

In [3]:
npaths = 4
path = "../TestData/paths%d/"%npaths
beta_df     = pd.read_csv(path + "unstdBeta_df.csv", index_col = 0)
se_df       = pd.read_csv(path + "unstdSE_df.csv", index_col = 0)
pval_df     = pd.read_csv(path + "pval_df.csv", index_col = 0)
traits_df   = pd.read_csv(path + "trait_info_nfil.csv", index_col = 0)

data_df = {"beta": beta_df,
           "se": se_df,
           "pval": pval_df}
col1 = beta_df.columns[0]
col2 = beta_df.columns[1]
beta_crop = beta_df.loc[:,[col1,col2]]
print(beta_crop)
print(col1, col2)

     Trait1     Trait2
1         1   0.877742
2         2   2.245622
3         3   1.943377
4         4   4.142426
5         5   4.588771
..      ...        ...
196      46  70.897001
197      47  71.615175
198      48  74.331159
199      49  74.911972
200      50  77.495756

[200 rows x 2 columns]
Trait1 Trait2


## Plotting functions

In [4]:

def chartclusters(data,title,colorvar,tooltip):
    chart = alt.Chart(data, title=title).mark_circle(size=60).encode(
        x = col1,
        y = col2,
        color = colorvar,
        tooltip = tooltip
    ).interactive()

    return(chart)

### Cluster with dbscan

In [5]:
results = beta_crop
# DBSCAN
eps_list = [0.001, 0.01, 0.1, 0.4]
eps_labels = ['dbscan%d'%i for i in range(4)]
dbscan_charts = {}
for i, eps in enumerate(eps_list):
    dbscan = DBSCAN(eps = eps, min_samples = 2, metric = "cosine").fit(beta_crop)
    collab = 'dbscan%d'%i
    results[collab] = dbscan.labels_
    tooltip = [collab, col1, col2]
    dbscan_charts[collab] = chartclusters(results,collab,collab+":N", tooltip)


In [6]:

# Display charts
display(Markdown("# Plots with increaseing eps"))
(dbscan_charts[eps_labels[0]] | dbscan_charts[eps_labels[1]]) & (dbscan_charts[eps_labels[2]] | dbscan_charts[eps_labels[3]])


# Plots with increaseing eps

### Cluster with k-means

In [7]:
# K-Means
nclust = 3
kmeans = KMeans(n_clusters= nclust, random_state=0, n_init="auto").fit(beta_crop)
klab = 'kmeans%d'%nclust
results[klab] = kmeans.labels_

### Cluster with GMM

In [8]:
# GMM
gmm = GaussianMixture(n_components=5, covariance_type="diag", random_state=0).fit(beta_crop)
results['mixture'] = gmm.predict(beta_crop)


### Cluster with Birch

In [9]:

# Birch
brc = Birch(n_clusters=None, threshold=0.01, branching_factor=50).fit(beta_crop)
results['birch'] = brc.predict(beta_crop)


### Cluster with Spectral

In [10]:

# Spectral
spectral = SpectralClustering(n_clusters=5, assign_labels='discretize', eigen_solver="arpack").fit(beta_crop)
results['spectral'] = spectral.labels_

[2.87109902e-12 4.74434374e-07 6.42740982e-07 1.19893912e-06
 1.22871603e-06 1.31000395e-05]
not reaching the requested tolerance 2.9802322387695312e-06.
Use iteration 1298 instead with accuracy 
2.2279137555347394e-06.

  _, diffusion_map = lobpcg(
[5.46573885e-14 5.16864470e-07 7.10073937e-07 1.37183751e-06
 1.36059898e-06 9.40810438e-06]
not reaching the requested tolerance 2.9802322387695312e-06.
  _, diffusion_map = lobpcg(


### Cluster with k-medoids - Euclidean

I suspect the cosine-similarity metric isn't being called within k-medoids. Test with Euclidean to see if the results are the same.

In [11]:
kmedoids_e = KMedoids(n_clusters=nclust, metric = "euclidean", random_state = 0).fit(beta_crop)
results['kmedoids_e'] = kmedoids_e.labels_

### Cluster with k-medoids - Cosine

In [12]:
similarities = cosine_similarity(beta_crop)
print('pairwise dense output:\n {}\n'.format(similarities))


pairwise dense output:
 [[1.         0.99992218 0.99978178 ... 0.70278909 0.70281314 0.69629085]
 [0.99992218 1.         0.9998896  ... 0.71156403 0.71159142 0.70514744]
 [0.99978178 0.9998896  1.         ... 0.71222265 0.71228849 0.70583835]
 ...
 [0.70278909 0.71156403 0.71222265 ... 1.         0.99999095 0.99995582]
 [0.70281314 0.71159142 0.71228849 ... 0.99999095 1.         0.99994695]
 [0.69629085 0.70514744 0.70583835 ... 0.99995582 0.99994695 1.        ]]



In [13]:
kmedoids_cp = KMedoids(n_clusters=nclust, metric = "precomputed", random_state = 0).fit(similarities)
results['kmedoids_cp'] = kmedoids_cp.labels_



In [14]:
kmedoids_c = KMedoids(n_clusters=nclust, metric = "cosine", random_state = 0).fit(similarities)
results['kmedoids_c'] = kmedoids_c.labels_

## Plot remaining cluster methods

In [15]:

# Create plots for different clustering methods
tooltip =[eps_labels[3],klab,'mixture','spectral','birch','kmedoids_e','kmedoids_c','kmedoids_cp']
kmeans = chartclusters(results,"K-Means",klab+":N",tooltip)
gmm = chartclusters(results,"GMM","mixture:N",tooltip)
spectral = chartclusters(results,"Spectral","spectral:N",tooltip)
birch = chartclusters(results,"Birch","birch:N",tooltip)
kmedoids_e = chartclusters(results,"K-Medoids Euclidean","kmedoids_e:N",tooltip)
kmedoids_c = chartclusters(results,"K-Medoids Cosine","kmedoids_c:N",tooltip)
kmedoids_cp = chartclusters(results,"K-Medoids Cosine Precomputed","kmedoids_cp:N",tooltip)

In [16]:
# Display charts
display(Markdown("# Plots for the main clustering methods"))
(dbscan_charts[eps_labels[1]] | kmeans) & (kmedoids_e | kmedoids_c) & kmedoids_cp


# Plots for the main clustering methods

K-Medoids is not behaving as we expect but it does not appear that the metric is using the default as the results are different.

In [17]:
# Display charts
display(Markdown("# Plots for other methods"))
(gmm | spectral) & birch

# Plots for other methods

### Realistic data