# Testing Clustering Methods

In [1]:
#%pip install scikit-learn
#%pip install scikit-learn-extra
#%pip install altair

In [2]:
from sklearn.cluster import DBSCAN, SpectralClustering, Birch, KMeans, MiniBatchKMeans
from sklearn_extra.cluster import KMedoids
from IPython.display import display, Markdown
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pandas as pd
import numpy as np
import altair as alt

## Get the MRClust data

In [3]:
# Load the DBP/CAD table from the MRClust data directory.
path = "../MRClustData/"
data_df = pd.read_csv(path + "DBP_CAD.csv", index_col=0)

# Keep the effect sizes (bx, by) and their standard errors (bxse, byse) in
# separate frames, both indexed by rsid and with columns named after the traits.
traits = ["DBP", "CAD"]
beta_df = data_df[["bx", "by"]].copy()
se_df = data_df[["bxse", "byse"]].copy()
beta_df.index = data_df["rsid"]
se_df.index = data_df["rsid"]
beta_df.columns = traits
se_df.columns = traits

# Drop the first row (per the original note it carries the rsid label only).
# The row is removed by POSITION: dropping by label would also remove any later
# row that happens to share the first row's rsid. No in-place mutation, so
# re-running the cell always starts from the freshly loaded frame.
beta_df = beta_df.iloc[1:].copy()
se_df = se_df.iloc[1:].copy()

data_dic = {"beta": beta_df,
            "se": se_df}
col1 = beta_df.columns[0]
col2 = beta_df.columns[1]
# Explicit copy: beta_crop is modified in place by the transform step below and
# must never write through to (or warn about) beta_df.
beta_crop = beta_df.loc[:, [col1, col2]].copy()
print(beta_crop)
print(col1, col2)

                DBP     CAD
rsid                       
rs17030613  -0.2359  0.0060
rs2932538    0.2370  0.0017
rs17367504   0.4715  0.0356
rs12405515   0.1617  0.0141
rs4245739    0.1364  0.0125
...             ...     ...
rs6557876    0.1966  0.0143
rs111245230  0.4000  0.0586
rs687621    -0.1327  0.0269
rs6271       0.3647  0.0359
rs76452347  -0.1691  0.0028

[118 rows x 2 columns]
DBP CAD


## Transform the data
Denote each data point by $p=(x,y)$.
If $x<0$, reflect the point through the origin: $\text{transform}(p)\rightarrow p'=(-x,-y)$.

In [4]:
# Reflect every point whose first coordinate is negative through the origin,
# (x, y) -> (-x, -y), so that all points end up with x >= 0.
# Rows are selected with a boolean mask rather than by index label: a label
# lookup would also flip rows that merely share an rsid with a negative row.
neg_mask = beta_crop[col1] < 0
neg_rows = beta_crop.loc[neg_mask].index
# .to_numpy() on the right-hand side makes the assignment purely positional
# (no index alignment), which keeps it valid even if rsids repeat.
beta_crop.loc[neg_mask, [col1, col2]] = -beta_crop.loc[neg_mask, [col1, col2]].to_numpy()

## Compute the cosine similarities

In [5]:
# Pairwise cosine similarity between the rows of beta_crop.
similarities = cosine_similarity(beta_crop)
print(f'pairwise dense output:\n {similarities}\n')

# NOTE: despite its name, `sims` holds cosine DISTANCES (1 - similarity).
# Values are clipped at zero, presumably to guard against tiny negative
# distances caused by floating-point round-off.
sims = np.clip(1 - similarities, 0, None)

# Working copy of the points; each clustering run below adds one label column.
results = beta_crop.copy()

pairwise dense output:
 [[1.         0.99946861 0.99492501 ... 0.98480067 0.9923774  0.99996064]
 [0.99946861 1.         0.99767611 ... 0.97861578 0.99586707 0.99971847]
 [0.99492501 0.99767611 1.         ... 0.96232644 0.999741   0.99577856]
 ...
 [0.98480067 0.97861578 0.96232644 ... 1.         0.95588929 0.98322092]
 [0.9923774  0.99586707 0.999741   ... 0.95588929 1.         0.99343172]
 [0.99996064 0.99971847 0.99577856 ... 0.98322092 0.99343172 1.        ]]



## Plotting functions

In [6]:

def chartclusters(data, title, colorvar, tooltip):
    """Build an interactive scatter chart of the first two columns of ``data``.

    Parameters
    ----------
    data : table-like object exposing ``.columns``
        The first column is plotted on the x axis, the second on the y axis.
    title : str
        Chart title.
    colorvar : str
        Altair encoding shorthand used for the point colour (e.g. ``"label:N"``).
    tooltip : str or list of str
        Field(s) shown when hovering over a point.

    Returns
    -------
    The chart produced by ``alt.Chart(...).mark_circle(...).encode(...).interactive()``.
    """
    x_field = data.columns[0]
    y_field = data.columns[1]
    points = alt.Chart(data, title=title).mark_circle(size=60)
    encoded = points.encode(x=x_field, y=y_field, color=colorvar, tooltip=tooltip)
    return encoded.interactive()

### Cluster with dbscan

In [7]:
# DBSCAN on the precomputed distance matrix `sims`, once per eps value.
eps_list = [1E-6, 1E-5, 1E-4, 1E-3, 1E-2, 1E-1]
eps_labels = [f'dbscan{i}' for i in range(len(eps_list))]
dbscan_charts = {}
for i in range(len(eps_list)):
    eps = eps_list[i]
    collab = eps_labels[i]
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed")
    dbscan.fit(sims)
    # One label column per eps value; the chart is coloured by that column.
    results[collab] = dbscan.labels_
    # The tooltip lists every DBSCAN column, including the ones that are only
    # added to `results` by later iterations of this loop.
    tooltip = eps_labels
    dbscan_charts[collab] = chartclusters(results, collab, f"{collab}:N", tooltip)


In [8]:

# Display charts: six DBSCAN panels, two per row, in the order of eps_list.
# (Heading typo "increaseing" fixed.)
display(Markdown("# Plots with increasing eps"))
(
    (dbscan_charts[eps_labels[0]] | dbscan_charts[eps_labels[1]])
    & (dbscan_charts[eps_labels[2]] | dbscan_charts[eps_labels[3]])
    & (dbscan_charts[eps_labels[4]] | dbscan_charts[eps_labels[5]])
)


# Plots with increaseing eps

### Cluster with k-means

In [9]:
# K-Means
# The estimator is fitted on the rows of `sims`, i.e. each point is represented
# by its vector of cosine distances to every point.
nclust = 4
klab = f'kmeans{nclust}'
kmeans = KMeans(n_clusters=nclust, random_state=0, n_init="auto")
kmeans.fit(sims)
results[klab] = kmeans.labels_

### Cluster with GMM

In [10]:
# GMM
# Five diagonal-covariance components, fitted on the rows of `sims`; the
# predicted component of each row becomes its cluster label.
gmm = GaussianMixture(n_components=5, covariance_type="diag", random_state=0)
gmm.fit(sims)
results['mixture'] = gmm.predict(sims)


### Cluster with Birch

In [11]:

# Birch on the rows of `sims`, once per threshold value.
thresh_list = [1E-2, 2.5E-2, 5E-2, 1E-1, 0.25, 0.5]
birch_labels = [f'birch_{i}' for i in range(len(thresh_list))]
birch_charts = {}
for i in range(len(thresh_list)):
    thresh = thresh_list[i]
    collab = birch_labels[i]
    brc = Birch(n_clusters=None, threshold=thresh, branching_factor=50)
    brc.fit(sims)
    # The tooltip lists every Birch column, including those added to `results`
    # by later iterations of this loop.
    tooltip = birch_labels
    results[collab] = brc.predict(sims)
    birch_charts[collab] = chartclusters(results, collab, f"{collab}:N", tooltip)



In [12]:
# Display charts: six Birch panels, two per row, in the order of thresh_list.
# (Heading typo "increaseing" fixed.)
display(Markdown("# Plots with increasing threshold"))
(
    (birch_charts[birch_labels[0]] | birch_charts[birch_labels[1]])
    & (birch_charts[birch_labels[2]] | birch_charts[birch_labels[3]])
    & (birch_charts[birch_labels[4]] | birch_charts[birch_labels[5]])
)


# Plots with increaseing threshold

### Cluster with Spectral

In [13]:

# Spectral
# With affinity="precomputed" the estimator expects a similarity matrix, so the
# raw cosine similarities are passed here rather than the distances in `sims`.
# random_state is pinned like every other stochastic estimator in this
# notebook; without it the labels can change between runs.
spectral = SpectralClustering(n_clusters=5,
                              assign_labels='discretize',
                              eigen_solver="arpack",
                              affinity="precomputed",
                              random_state=0).fit(similarities)
results['spectral'] = spectral.labels_

### Cluster with k-means mini-batch

In [14]:
# Mini-batch K-Means with the same number of clusters as the full K-Means run,
# fitted on the rows of `sims`.
km_lab = f'kmeans_minibatch{nclust}'
kmeans_batch = MiniBatchKMeans(
    n_clusters=nclust,
    random_state=0,
    batch_size=30,
    max_iter=10,
    n_init="auto",
)
kmeans_batch.fit(sims)
results[km_lab] = kmeans_batch.labels_

### Cluster with k-medoids - Euclidean

I suspect the cosine-similarity metric isn't being called within k-medoids. Test with Euclidean to see if the results are the same.

In [15]:
# K-Medoids on the raw (reflected) points, using Euclidean distance.
kmedoids_e = KMedoids(n_clusters=nclust, metric="euclidean", random_state=0)
kmedoids_e.fit(beta_crop)
results['kmedoids_e'] = kmedoids_e.labels_

### Cluster with k-medoids - Cosine

In [16]:
# K-Medoids fed the precomputed cosine-distance matrix `sims` directly.
kmedoids_cp = KMedoids(n_clusters=nclust, metric="precomputed", random_state=0)
kmedoids_cp.fit(sims)
results['kmedoids_cp'] = kmedoids_cp.labels_

In [17]:
# K-Medoids on the raw (reflected) points, letting the estimator compute the
# cosine metric itself.
kmedoids_c = KMedoids(n_clusters=nclust, metric="cosine", random_state=0)
kmedoids_c.fit(beta_crop)
results['kmedoids_c'] = kmedoids_c.labels_

## Plot remaining cluster methods

In [18]:

# Create one chart per clustering method, all sharing the same tooltip fields.
# NOTE: the chart variables below reuse the names of the fitted estimators
# (kmeans, gmm, spectral, kmedoids_*), so the estimators are no longer
# reachable under those names once this cell has run.
tooltip = [
    eps_labels[0],
    eps_labels[1],
    klab,
    'mixture',
    'spectral',
    birch_labels[2],
    'kmedoids_e',
    'kmedoids_c',
    'kmedoids_cp',
    km_lab,
]
# (chart title, label column in `results`) for each method, in display order.
chart_specs = [
    ("K-Means", klab),
    ("GMM", "mixture"),
    ("Spectral", "spectral"),
    ("K-Medoids Euclidean", "kmedoids_e"),
    ("K-Medoids Cosine", "kmedoids_c"),
    ("K-Medoids Cosine Precomputed", "kmedoids_cp"),
    ("K-means minibatch", km_lab),
]
(kmeans, gmm, spectral, kmedoids_e, kmedoids_c, kmedoids_cp, kmeans_mini) = [
    chartclusters(results, heading, f"{column}:N", tooltip)
    for heading, column in chart_specs
]

In [19]:
# Display charts, two per row. The mini-batch K-Means chart is built in the
# previous cell but was missing from this grid; it now sits next to the
# precomputed K-Medoids chart in the last row.
display(Markdown("# Plots for the main clustering methods"))
(
    (dbscan_charts[eps_labels[0]] | dbscan_charts[eps_labels[1]])
    & (birch_charts[birch_labels[2]] | kmeans)
    & (gmm | spectral)
    & (kmedoids_e | kmedoids_c)
    & (kmedoids_cp | kmeans_mini)
)


# Plots for the main clustering methods

K-Medoids with cosine as the metric and K-Medoids with precomputed cosine distances don't match.