# Testing Clustering Methods

In [1]:
#%pip install scikit-learn
#%pip install scikit-learn-extra
#%pip install altair

In [2]:
from sklearn.cluster import DBSCAN, SpectralClustering, Birch, KMeans
from sklearn_extra.cluster import KMedoids
from IPython.display import display, Markdown
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
import numpy as np
import altair as alt

## Setup test data

In [3]:
# Number of paths: selects the test-data folder and is reused further down as
# the cluster count for the k-means / k-medoids runs.
npaths = 4
path = "../TestData/paths%d/" % npaths

# Every table is read with its first CSV column as the row index.
beta_df, se_df, pval_df, traits_df = (
    pd.read_csv(path + file_name, index_col=0)
    for file_name in (
        "unstdBeta_df.csv",
        "unstdSE_df.csv",
        "pval_df.csv",
        "trait_info_nfil.csv",
    )
)

# Lookup of the three statistic tables by short name.
data_df = dict(beta=beta_df, se=se_df, pval=pval_df)

# Restrict the beta table to its first two columns; this 2-D slice is what
# all clustering methods below are fitted on.
col1, col2 = beta_df.columns[0], beta_df.columns[1]
beta_crop = beta_df.loc[:, [col1, col2]]
print(beta_crop)
print(col1, col2)

     Trait1     Trait2
1         1   2.086726
2         2   6.120570
3         3   6.886463
4         4   9.754806
5         5  12.277511
..      ...        ...
196      46  43.173467
197      47  44.186414
198      48  44.284154
199      49  46.801117
200      50  47.091551

[200 rows x 2 columns]
Trait1 Trait2


## Plotting functions

In [4]:

def chartclusters(data, title, colorvar, tooltip):
    """Build an interactive scatter plot of the first two columns of ``data``.

    Parameters
    ----------
    data
        Table exposing ``.columns`` (a pandas DataFrame in this notebook).
        Its first column is used for the x axis and its second for the y axis.
    title
        Title passed to the Altair chart.
    colorvar
        Encoding used for the point colour (e.g. ``"kmeans4:N"``).
    tooltip
        Encoding(s) shown in the hover tooltip.

    Returns
    -------
    The chart with circle marks of size 60, made interactive.
    """
    x_field, y_field = data.columns[0], data.columns[1]
    base = alt.Chart(data, title=title)
    points = base.mark_circle(size=60)
    encoded = points.encode(
        x=x_field,
        y=y_field,
        color=colorvar,
        tooltip=tooltip,
    )
    return encoded.interactive()

### Cluster with dbscan

In [5]:
# DBSCAN over a grid of eps values, using the cosine metric between rows.
# `results` starts as a copy of the two-column frame and collects one label
# column per clustering run.
results = beta_crop.copy()
eps_list = [0.001, 0.0025, 0.005, 0.01, 0.1, 0.4]
eps_labels = ['dbscan%d' % i for i in range(len(eps_list))]

# Pass 1: fit every model and store its labels, so that every column named in
# the tooltip exists in `results` before any chart refers to it.
for collab, eps in zip(eps_labels, eps_list):
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="cosine").fit(beta_crop)
    results[collab] = dbscan.labels_

# Pass 2: one chart per eps, coloured by that run's labels. Every chart's
# tooltip lists the labels of all runs for side-by-side comparison.
tooltip = eps_labels
dbscan_charts = {
    label: chartclusters(results, label, label + ":N", tooltip)
    for label in eps_labels
}


In [6]:

# Display charts
display(Markdown("# Plots with increasing eps"))
# Lay the charts out two per row in order of increasing eps. The rows are
# derived from eps_labels so every run in the sweep is shown.
dbscan_rows = [
    alt.hconcat(*(dbscan_charts[label] for label in eps_labels[start:start + 2]))
    for start in range(0, len(eps_labels), 2)
]
alt.vconcat(*dbscan_rows)


# Plots with increaseing eps

### Cluster with k-means

In [7]:
# K-Means with one cluster per path; random_state is fixed at 0.
nclust = npaths
klab = 'kmeans%d' % nclust
kmeans = KMeans(n_clusters=nclust, n_init="auto", random_state=0)
kmeans.fit(beta_crop)
# Store the labels under a column name that records the cluster count.
results[klab] = kmeans.labels_



### Cluster with GMM

In [8]:
# Gaussian mixture with 5 diagonal-covariance components
# (this count is hard-coded and independent of nclust).
gmm = GaussianMixture(random_state=0, covariance_type="diag", n_components=5)
gmm.fit(beta_crop)
results['mixture'] = gmm.predict(beta_crop)




### Cluster with Birch

In [9]:

# Birch with n_clusters=None; labels come from predicting on the training data.
brc = Birch(threshold=0.01, branching_factor=50, n_clusters=None)
brc.fit(beta_crop)
results['birch'] = brc.predict(beta_crop)


### Cluster with Spectral

In [10]:

# Spectral clustering into 5 clusters with 'discretize' label assignment.
# random_state is pinned to 0, as for the other estimators in this notebook,
# so that the labels are reproducible between runs.
spectral = SpectralClustering(
    n_clusters=5,
    assign_labels='discretize',
    eigen_solver="arpack",
    random_state=0,
).fit(beta_crop)
results['spectral'] = spectral.labels_

[2.76259034e-14 3.22644654e-07 1.50201419e-06 4.52345038e-06
 1.29795107e-05 1.07005888e-05]
not reaching the requested tolerance 2.9802322387695312e-06.
Use iteration 1568 instead with accuracy 
3.0090637936932647e-06.

  _, diffusion_map = lobpcg(
[2.73244444e-14 2.24456268e-07 1.61553933e-06 2.54835746e-06
 8.35435911e-06 5.31184349e-06]
not reaching the requested tolerance 2.9802322387695312e-06.
  _, diffusion_map = lobpcg(


### Cluster with k-medoids - Euclidean

I suspect the cosine-similarity metric isn't being called within k-medoids. Test with Euclidean to see if the results are the same.

In [11]:
# K-Medoids on the two-column frame with the Euclidean metric, as a baseline
# to compare against the cosine runs below.
kmedoids_e = KMedoids(metric="euclidean", n_clusters=nclust, random_state=0)
kmedoids_e.fit(beta_crop)
results['kmedoids_e'] = kmedoids_e.labels_

### Cluster with k-medoids - Cosine

In [12]:
# Rebuild the two-column frame and compute pairwise cosine similarity between rows.
beta_crop = beta_df.loc[:, [col1, col2]]
similarities = cosine_similarity(beta_crop)
# Turn similarity into a dissimilarity (1 - similarity), clipping at zero so
# that no entry of the matrix is negative.
simi_one_min = np.clip(1 - similarities, 0, None)
print('pairwise dense output:\n {}\n'.format(simi_one_min))


pairwise dense output:
 [[0.00000000e+00 8.57559487e-03 6.49358225e-04 ... 7.08747107e-02
  6.46189083e-02 6.71166540e-02]
 [8.57559487e-03 0.00000000e+00 4.51068799e-03 ... 1.27163957e-01
  1.18854734e-01 1.22185363e-01]
 [6.49358225e-04 4.51068799e-03 1.11022302e-16 ... 8.48013678e-02
  7.79686610e-02 8.07003355e-02]
 ...
 [7.08747107e-02 1.27163957e-01 8.48013678e-02 ... 2.22044605e-16
  1.49554281e-04 5.30110501e-05]
 [6.46189083e-02 1.18854734e-01 7.79686610e-02 ... 1.49554281e-04
  0.00000000e+00 2.44874558e-05]
 [6.71166540e-02 1.22185363e-01 8.07003355e-02 ... 5.30110501e-05
  2.44874558e-05 0.00000000e+00]]



In [13]:
# K-Medoids fed the precomputed (1 - cosine similarity) matrix directly.
kmedoids_cp = KMedoids(metric="precomputed", n_clusters=nclust, random_state=0)
kmedoids_cp.fit(simi_one_min)
results['kmedoids_cp'] = kmedoids_cp.labels_

In [14]:
# K-Medoids on the two-column frame, letting the estimator apply metric="cosine" itself.
kmedoids_c = KMedoids(metric="cosine", n_clusters=nclust, random_state=0)
kmedoids_c.fit(beta_crop)
results['kmedoids_c'] = kmedoids_c.labels_

## Plot remaining cluster methods

In [15]:

# Create plots for different clustering methods.
# NOTE: kmeans, gmm, spectral and the kmedoids_* names held fitted estimators
# up to this point; from here on they are rebound to the corresponding charts.
# NOTE(review): the tooltip lists eps_labels[3], while the display cell below
# shows the eps_labels[1] chart - confirm which DBSCAN run is intended.
tooltip = [
    eps_labels[3], klab, 'mixture', 'spectral', 'birch',
    'kmedoids_e', 'kmedoids_c', 'kmedoids_cp',
]

# (chart title, label column in `results`) for each method, in creation order.
chart_specs = [
    ("K-Means", klab),
    ("GMM", "mixture"),
    ("Spectral", "spectral"),
    ("Birch", "birch"),
    ("K-Medoids Euclidean", "kmedoids_e"),
    ("K-Medoids Cosine", "kmedoids_c"),
    ("K-Medoids Cosine Precomputed", "kmedoids_cp"),
]
kmeans, gmm, spectral, birch, kmedoids_e, kmedoids_c, kmedoids_cp = [
    chartclusters(results, chart_title, label_col + ":N", tooltip)
    for chart_title, label_col in chart_specs
]

In [16]:
# Show the main methods: DBSCAN next to K-Means, the two K-Medoids variants
# side by side, and the precomputed-cosine K-Medoids on its own row.
display(Markdown("# Plots for the main clustering methods"))
top_row = dbscan_charts[eps_labels[1]] | kmeans
middle_row = kmedoids_e | kmedoids_c
top_row & middle_row & kmedoids_cp


# Plots for the main clustering methods

K-Medoids with cosine as the metric and K-Medoids with a precomputed cosine distance matrix don't match.

In [17]:
# Show the remaining methods: GMM beside Spectral, with Birch underneath.
display(Markdown("# Plots for other methods"))
first_row = gmm | spectral
first_row & birch

# Plots for other methods

### Copying TG code

In [18]:
# KMedoids (cosine), 3 clusters.
# Start from a fresh two-column copy so that re-running this cell never fits
# on a previously appended label column or on the frame renamed further down.
# The explicit copy also makes the column assignment below unambiguous.
beta_crop = beta_df.loc[:, [col1, col2]].copy()
kmedioids = KMedoids(n_clusters=3, metric="cosine", random_state=0).fit(beta_crop)
beta_crop['kmedioids'] = kmedioids.labels_

In [19]:
# Plot with generic axis names: the two data columns become "x" and "y",
# and points are coloured (and labelled on hover) by the k-medoids cluster.
tooltip = ['kmedioids']
axis_names = {col1: "x", col2: "y"}
beta_crop = beta_crop.rename(columns=axis_names)
chartclusters(beta_crop, "KMedioids", "kmedioids:N", tooltip=tooltip)

In [20]:
# Record the scikit-learn and scikit-learn-extra versions this notebook was run with.
import sklearn
import sklearn_extra
print(f"sklearn: {sklearn.__version__}; sklearn_extra: {sklearn_extra.__version__}")

sklearn: 1.2.1; sklearn_extra: 0.3.0
