# Testing Clustering Methods

In [313]:
from sklearn.cluster import DBSCAN, SpectralClustering, Birch, KMeans
from IPython.display import display, Markdown
from sklearn.mixture import GaussianMixture
import pandas as pd
import numpy as np
import altair as alt

## Setup test data

In [314]:
npaths = 4
path = "../TestData/paths%d/"%npaths
beta_df     = pd.read_csv(path + "unstdBeta_df.csv", index_col = 0)
se_df       = pd.read_csv(path + "unstdSE_df.csv", index_col = 0)
pval_df     = pd.read_csv(path + "pval_df.csv", index_col = 0)
traits_df   = pd.read_csv(path + "trait_info_nfil.csv", index_col = 0)

data_df = {"beta": beta_df,
           "se": se_df,
           "pval": pval_df}
col1 = beta_df.columns[0]
col2 = beta_df.columns[1]
print(col1, col2)

Trait1 Trait2


## Plotting functions

In [315]:

def chartclusters(data,title,colorvar,tooltip):
    chart = alt.Chart(data, title=title).mark_circle(size=60).encode(
        x = col1,
        y = col2,
        color = colorvar,
        tooltip = tooltip
    ).interactive()

    return(chart)

### Cluster with dbscan

In [316]:
results = beta_df
# DBSCAN
eps_list = [0.001, 0.01, 0.1, 0.4]
eps_labels = ['dbscan%d'%i for i in range(4)]
dbscan_charts = {}
for i, eps in enumerate(eps_list):
    dbscan = DBSCAN(eps = eps, min_samples = 2, metric = "cosine").fit(beta_df)
    collab = 'dbscan%d'%i
    results[collab] = dbscan.labels_
    tooltip = [collab, col1, col2]
    dbscan_charts[collab] = chartclusters(results,collab,collab+":N", tooltip)


In [317]:

# Display charts
display(Markdown("# Plots with increaseing eps"))
(dbscan_charts[eps_labels[0]] | dbscan_charts[eps_labels[1]]) & (dbscan_charts[eps_labels[2]] | dbscan_charts[eps_labels[3]])


# Plots with increaseing eps

### Cluster with k-means

In [318]:
# K-Means
nclust = 5
kmeans = KMeans(n_clusters= nclust, random_state=0, n_init="auto").fit(beta_df)
klab = 'kmeans%d'%nclust
results[klab] = kmeans.labels_

### Cluster with GMM

In [319]:
# GMM
gmm = GaussianMixture(n_components=5, covariance_type="diag", random_state=0).fit(beta_df)
results['mixture'] = gmm.predict(beta_df)


### Cluster with Birch

In [320]:

# Birch
brc = Birch(n_clusters=None, threshold=0.01, branching_factor=50).fit(beta_df)
results['birch'] = brc.predict(beta_df)


### Cluster with Spectral

In [321]:

# Spectral
spectral = SpectralClustering(n_clusters=5, assign_labels='discretize', eigen_solver="arpack").fit(beta_df)
results['spectral'] = spectral.labels_

[6.56543790e-16 1.42568748e-06 1.07474900e-06 7.33738016e-06
 2.27307337e-06 1.47644044e-05]
not reaching the requested tolerance 2.9802322387695312e-06.
Use iteration 1577 instead with accuracy 
2.689054926504999e-06.

  _, diffusion_map = lobpcg(
[1.56656884e-15 1.27051489e-06 8.34719684e-07 7.51626461e-06
 1.30934241e-06 5.20349774e-06]
not reaching the requested tolerance 2.9802322387695312e-06.
  _, diffusion_map = lobpcg(


## Plot remaining cluster methods

In [322]:

# Create plots for different clustering methods
tooltip =[eps_labels[3],klab,'mixture','spectral','birch']
kmeans = chartclusters(results,"K-Means",klab+":N",tooltip)
gmm = chartclusters(results,"GMM","mixture:N",tooltip)
spectral = chartclusters(results,"Spectral","spectral:N",tooltip)
birch = chartclusters(results,"Birch","birch:N",tooltip)

In [323]:
# Display charts
display(Markdown("# Plots with increaseing eps"))
(dbscan_charts[eps_labels[1]] | kmeans) & (gmm | spectral) & birch


# Plots with increaseing eps