# Compare methods with pre-computed distances

* Affinity Propagation
* Agglomerative Clustering
* BIRCH
* DBSCAN
* K-Means
* Mini-Batch K-Means
* K-Medoids
* Mean Shift
* OPTICS
* Spectral Clustering
* Mixture of Gaussians

## Set up the packages

In [54]:
#%pip install seaborn

In [55]:
from sklearn.cluster import AffinityPropagation, DBSCAN, SpectralClustering, Birch, KMeans, AgglomerativeClustering
from sklearn.cluster import OPTICS, MiniBatchKMeans, estimate_bandwidth, MeanShift
from sklearn_extra.cluster import KMedoids
from IPython.display import display, Markdown
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.spatial.distance import mahalanobis
from itertools import product
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns

## Plotting Function

In [56]:
def chartclusters(data, title, colorvar, tooltip, palette=None, clustgroups=None):
    """Build an interactive Altair scatter chart of the first two columns of ``data``.

    Parameters
    ----------
    data : table with a ``.columns`` attribute; column 0 goes on x, column 1 on y.
    title : chart title.
    colorvar : colour encoding (callers in this notebook pass ``"<column>:N"``).
    tooltip : forwarded unchanged to the ``tooltip`` encoding.
    palette : optional list of colours. When given, colours are assigned through
        an explicit scale whose range is ``palette``.
    clustgroups : domain of that explicit scale; only read when ``palette`` is given.

    Returns
    -------
    The chart, with ``.interactive()`` applied.
    """
    x_field, y_field = data.columns[0], data.columns[1]

    # The only difference between the two modes is how colour is encoded.
    if palette is None:
        colour = colorvar
    else:
        colour = alt.Color(colorvar, scale=alt.Scale(domain=clustgroups, range=palette))

    points = alt.Chart(data, title=title).mark_circle(size=60)
    return points.encode(
        x=x_field,
        y=y_field,
        color=colour,
        tooltip=tooltip,
    ).interactive()

## Function to translate the data

In [57]:
def translate_origin(new_origin, data_df, *, verbose=True):
    """Return a copy of ``data_df`` expressed relative to ``new_origin``.

    Parameters
    ----------
    new_origin : array-like with one entry per column of ``data_df``
        The point that becomes the origin; it is subtracted from every row.
    data_df : pandas.DataFrame
        Data to shift. It is not modified.
    verbose : bool, keyword-only, default True
        When true, print ``new_origin`` (the historical behaviour of this helper).

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``data_df``.
    """
    if verbose:
        print(new_origin)
    # One vectorised subtraction broadcast across the rows replaces the former
    # per-row ``iloc`` loop; the result already carries data_df's index/columns.
    return data_df.sub(new_origin, axis="columns")

## Function to compute the cosine similarities

In [58]:
def cos_sim(mat):
    """Compute pairwise cosine similarity between the rows of ``mat`` and the matching distance.

    Returns
    -------
    dict
        ``"cos_sim"``  -- the matrix returned by ``cosine_similarity(mat)``.
        ``"cos_dist"`` -- ``1 - cos_sim`` with negative entries raised to 0
        (presumably to absorb rounding error before the matrix is used as a
        precomputed distance).
    """
    similarities = cosine_similarity(mat)
    distances = 1 - similarities
    distances = distances.clip(min=0)
    return {"cos_sim": similarities, "cos_dist": distances}


# Radial Pathways
## Set up the data

In [59]:
# Radial test set: read the four CSV tables, then keep only the first two
# columns of the beta table (and the same columns of the SE / p-value tables).
npaths = 3
path = "../TestData_radial/paths%d/"%npaths

beta_df, se_df, pval_df, traits_df = (
    pd.read_csv(path + csv_name, index_col=0)
    for csv_name in (
        "unstdBeta_df.csv",
        "unstdSE_df.csv",
        "pval_df.csv",
        "trait_info_nfil.csv",
    )
)
data_df = dict(beta=beta_df, se=se_df, pval=pval_df)

# The two columns that define the 2-D plane used in the rest of this section.
col1, col2 = beta_df.columns[0], beta_df.columns[1]
beta_crop, se_crop, pval_crop = (
    frame.loc[:, [col1, col2]] for frame in (beta_df, se_df, pval_df)
)

# Plain NumPy views of the cropped tables.
beta_mat, se_mat, pval_mat = (
    cropped.to_numpy() for cropped in (beta_crop, se_crop, pval_crop)
)
data_arr = dict(beta=beta_mat, se=se_mat, pval=pval_mat)

## Compute equally spaced origins

In [60]:
# Candidate origins lie on the line x = 0, at nchop evenly spaced heights
# between the smallest and largest value of the second column of beta_mat.
nchop = 10
ncols = beta_mat.shape[1]
niter = 3 * ncols
min_cols, max_cols = beta_mat.min(axis=0), beta_mat.max(axis=0)
x = 0.0
y = np.linspace(start=min_cols[1], stop=max_cols[1], num=nchop)

### Iterate through moving the origin

In [61]:
# Sweep the first nine candidate origins. For each one: re-centre the data,
# cluster its cosine-distance matrix with K-Medoids and DBSCAN, and store one
# chart per method keyed by the label of the column holding the cluster ids.
dbscan_plots, kmedoids_plots = {}, {}
db_labs, kmed_labs = [], []
nclust = npaths
eps = 0.001
for i in range(9):
    new_origin = np.array([x, y[i]])
    new_data_set = translate_origin(new_origin, beta_crop)
    sims_new = cos_sim(new_data_set)
    results_new = new_data_set.copy()

    # K-Medoids on the precomputed distances.
    kmed_lab = 'kmedoids%d_y%d'%(nclust,i)
    kmedoids = KMedoids(n_clusters=nclust, metric="precomputed", random_state=0)
    kmedoids.fit(sims_new["cos_dist"])
    kmed_labs.append(kmed_lab)
    results_new[kmed_lab] = kmedoids.labels_

    # DBSCAN on the same distances.
    dblab = 'dbscan_eps%.3d_y%d'%(1e+3*eps,i)
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed")
    dbscan.fit(sims_new["cos_dist"])
    results_new[dblab] = dbscan.labels_
    db_labs.append(dblab)

    # One palette shared by both charts; the label -1 (junk / noise) is forced to grey.
    tooltip1 = [dblab, kmed_lab]
    clust_groups = pd.unique(results_new[tooltip1].values.ravel('K'))
    ncolours = len(clust_groups)
    palette = sns.color_palette(None, ncolours).as_hex()
    junk_ind = np.where(clust_groups == -1)
    if junk_ind[0].size > 0:
        palette[junk_ind[0][0]] = '#808080'

    dbscan_new = chartclusters(
        results_new, dblab, dblab+":N", tooltip1,
        palette=palette, clustgroups=clust_groups,
    )
    kmedoi_new = chartclusters(
        results_new, kmed_lab, kmed_lab+":N", tooltip1,
        palette=palette, clustgroups=clust_groups,
    )
    dbscan_plots[dblab] = dbscan_new
    kmedoids_plots[kmed_lab] = kmedoi_new


[   0.         -104.97277276]
[  0.         -81.96054795]
[  0.         -58.94832314]
[  0.         -35.93609833]


[  0.         -12.92387352]
[ 0.         10.08835129]
[ 0.        33.1005761]
[ 0.         56.11280091]
[ 0.         79.12502571]


## Plot the iteration for DBSCAN

In [62]:
# Nine DBSCAN charts, two per row; the ninth chart sits alone on the last row.
(
    (dbscan_plots[db_labs[0]] | dbscan_plots[db_labs[1]])
    & (dbscan_plots[db_labs[2]] | dbscan_plots[db_labs[3]])
    & (dbscan_plots[db_labs[4]] | dbscan_plots[db_labs[5]])
    & (dbscan_plots[db_labs[6]] | dbscan_plots[db_labs[7]])
    & dbscan_plots[db_labs[8]]
)

## Plot the iteration for kmedoids

In [63]:
# Nine K-Medoids charts, two per row; the ninth chart sits alone on the last row.
(
    (kmedoids_plots[kmed_labs[0]] | kmedoids_plots[kmed_labs[1]])
    & (kmedoids_plots[kmed_labs[2]] | kmedoids_plots[kmed_labs[3]])
    & (kmedoids_plots[kmed_labs[4]] | kmedoids_plots[kmed_labs[5]])
    & (kmedoids_plots[kmed_labs[6]] | kmedoids_plots[kmed_labs[7]])
    & kmedoids_plots[kmed_labs[8]]
)

# Parallel Pathways

In [64]:
# Parallel test set: read the four CSV tables, then keep only the first two
# columns of the beta table (and the same columns of the SE / p-value tables).
npaths = 4
path = "../TestData_parallel/paths%d/"%npaths

beta_df, se_df, pval_df, traits_df = (
    pd.read_csv(path + csv_name, index_col=0)
    for csv_name in (
        "unstdBeta_df.csv",
        "unstdSE_df.csv",
        "pval_df.csv",
        "trait_info_nfil.csv",
    )
)
data_df = dict(beta=beta_df, se=se_df, pval=pval_df)

# The two columns that define the 2-D plane used in the rest of this section.
col1, col2 = beta_df.columns[0], beta_df.columns[1]
beta_crop, se_crop, pval_crop = (
    frame.loc[:, [col1, col2]] for frame in (beta_df, se_df, pval_df)
)

# Plain NumPy views of the cropped tables.
beta_mat, se_mat, pval_mat = (
    cropped.to_numpy() for cropped in (beta_crop, se_crop, pval_crop)
)
data_arr = dict(beta=beta_mat, se=se_mat, pval=pval_mat)

## Compute equally spaced origins

In [65]:
# Candidate origins lie on the line x = 0, at nchop evenly spaced heights
# between the smallest and largest value of the second column of beta_mat.
nchop = 10
ncols = beta_mat.shape[1]
niter = 3 * ncols
min_cols, max_cols = beta_mat.min(axis=0), beta_mat.max(axis=0)
x = 0.0
y = np.linspace(start=min_cols[1], stop=max_cols[1], num=nchop)

## Iterate through moving the origin

In [66]:
# Sweep every DBSCAN eps in eps_opts against the first nine candidate origins.
# Charts are stored per method, keyed by the label of the column holding the
# cluster ids; db_labs lists the DBSCAN labels eps-major (nine per eps), which
# is the order the plotting cells below index into.
dbscan_plots = {}
kmedoids_plots = {}
db_labs = []
kmed_labs = []
eps_opts = [1E-4, 1E-3, 2.5E-3, 5E-3, 1E-2, 2.5E-2, 5E-2]
nclust = npaths
for j, eps in enumerate(eps_opts):
    # In this section DBSCAN runs with Euclidean distance on the untranslated
    # beta_mat, so its result depends only on eps, not on the origin: fit it
    # once per eps instead of once per (eps, origin) pair.
    # (The other sections instead use
    #  DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(sims_new["cos_dist"]).)
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="euclidean").fit(beta_mat)
    for i in range(0, 9):
        new_origin = np.array([x, y[i]])
        new_data_set = translate_origin(new_origin, beta_crop)
        sims_new = cos_sim(new_data_set)
        results_new = new_data_set.copy()

        # K-Medoids on the cosine distances measured from the shifted origin.
        kmedoids = KMedoids(n_clusters=nclust, metric="precomputed", random_state=0).fit(sims_new["cos_dist"])
        kmed_lab = 'kmedoids%d_y%d'%(nclust,i)
        # NOTE: the same nine K-Medoids labels are appended once per eps, so
        # kmed_labs contains repeats; its first nine entries cover every origin.
        kmed_labs.append(kmed_lab)
        results_new[kmed_lab] = kmedoids.labels_

        # NOTE: "%.3d" truncates 1e3*eps to an integer, so e.g. eps=2.5e-3 is
        # labelled "002"; labels stay unique only while the truncated values differ.
        dblab = 'dbscan_eps%.3d_y%d'%(1e+3*eps,i)
        results_new[dblab] = dbscan.labels_
        db_labs.append(dblab)

        tooltip1 = [dblab, kmed_lab]
        # Create a palette to ensure the junk and noise clusters are grey.
        clust_groups = pd.unique(results_new[tooltip1].values.ravel('K'))
        ncolours = len(clust_groups)
        palette = sns.color_palette(None, ncolours).as_hex()
        if -1 in clust_groups:
            junk_ind = np.where(clust_groups == -1)
            palette[junk_ind[0][0]] = '#808080'
        dbscan_new = chartclusters(results_new, dblab, dblab+":N", tooltip1, palette=palette, clustgroups=clust_groups)
        kmedoi_new = chartclusters(results_new, kmed_lab, kmed_lab+":N", tooltip1, palette=palette, clustgroups=clust_groups)
        dbscan_plots[dblab] = dbscan_new
        kmedoids_plots[kmed_lab] = kmedoi_new


[0.         1.51682524]
[ 0.         23.47903211]
[ 0.         45.44123898]
[ 0.         67.40344585]
[ 0.         89.36565272]
[  0.        111.3278596]
[  0.         133.29006647]
[  0.         155.25227334]
[  0.         177.21448021]
[0.         1.51682524]
[ 0.         23.47903211]
[ 0.         45.44123898]
[ 0.         67.40344585]
[ 0.         89.36565272]
[  0.        111.3278596]
[  0.         133.29006647]
[  0.         155.25227334]
[  0.         177.21448021]
[0.         1.51682524]
[ 0.         23.47903211]
[ 0.         45.44123898]
[ 0.         67.40344585]
[ 0.         89.36565272]
[  0.        111.3278596]
[  0.         133.29006647]
[  0.         155.25227334]
[  0.         177.21448021]
[0.         1.51682524]
[ 0.         23.47903211]
[ 0.         45.44123898]
[ 0.         67.40344585]
[ 0.         89.36565272]
[  0.        111.3278596]
[  0.         133.29006647]
[  0.         155.25227334]
[  0.         177.21448021]
[0.         1.51682524]
[ 0.         23.47903211

## Plot the iteration for DBSCAN

In [67]:
# Announce the DBSCAN eps (eps_opts[0]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[0],))

Eps value 0.0001


In [68]:
# DBSCAN charts for eps_opts[0] (entries 0-8 of db_labs), two per row.
(
    (dbscan_plots[db_labs[0]] | dbscan_plots[db_labs[1]])
    & (dbscan_plots[db_labs[2]] | dbscan_plots[db_labs[3]])
    & (dbscan_plots[db_labs[4]] | dbscan_plots[db_labs[5]])
    & (dbscan_plots[db_labs[6]] | dbscan_plots[db_labs[7]])
    & dbscan_plots[db_labs[8]]
)

In [69]:
# Announce the DBSCAN eps (eps_opts[1]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[1],))

Eps value 0.0010


In [70]:
# DBSCAN charts for eps_opts[1] (entries 9-17 of db_labs), two per row.
(
    (dbscan_plots[db_labs[9]] | dbscan_plots[db_labs[10]])
    & (dbscan_plots[db_labs[11]] | dbscan_plots[db_labs[12]])
    & (dbscan_plots[db_labs[13]] | dbscan_plots[db_labs[14]])
    & (dbscan_plots[db_labs[15]] | dbscan_plots[db_labs[16]])
    & dbscan_plots[db_labs[17]]
)

In [71]:
# Announce the DBSCAN eps (eps_opts[2]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[2],))

Eps value 0.0025


In [72]:
# DBSCAN charts for eps_opts[2] (entries 18-26 of db_labs), two per row.
(
    (dbscan_plots[db_labs[18]] | dbscan_plots[db_labs[19]])
    & (dbscan_plots[db_labs[20]] | dbscan_plots[db_labs[21]])
    & (dbscan_plots[db_labs[22]] | dbscan_plots[db_labs[23]])
    & (dbscan_plots[db_labs[24]] | dbscan_plots[db_labs[25]])
    & dbscan_plots[db_labs[26]]
)

In [73]:
# Announce the DBSCAN eps (eps_opts[3]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[3],))

Eps value 0.0050


In [74]:
# DBSCAN charts for eps_opts[3] (entries 27-35 of db_labs), two per row.
(
    (dbscan_plots[db_labs[27]] | dbscan_plots[db_labs[28]])
    & (dbscan_plots[db_labs[29]] | dbscan_plots[db_labs[30]])
    & (dbscan_plots[db_labs[31]] | dbscan_plots[db_labs[32]])
    & (dbscan_plots[db_labs[33]] | dbscan_plots[db_labs[34]])
    & dbscan_plots[db_labs[35]]
)

In [75]:
# Announce the DBSCAN eps (eps_opts[4]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[4],))

Eps value 0.0100


In [76]:
# DBSCAN charts for eps_opts[4] (entries 36-44 of db_labs), two per row.
(
    (dbscan_plots[db_labs[36]] | dbscan_plots[db_labs[37]])
    & (dbscan_plots[db_labs[38]] | dbscan_plots[db_labs[39]])
    & (dbscan_plots[db_labs[40]] | dbscan_plots[db_labs[41]])
    & (dbscan_plots[db_labs[42]] | dbscan_plots[db_labs[43]])
    & dbscan_plots[db_labs[44]]
)

In [77]:
# Announce the DBSCAN eps (eps_opts[5]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[5],))

Eps value 0.0250


In [78]:
# DBSCAN charts for eps_opts[5] (entries 45-53 of db_labs), two per row.
(
    (dbscan_plots[db_labs[45]] | dbscan_plots[db_labs[46]])
    & (dbscan_plots[db_labs[47]] | dbscan_plots[db_labs[48]])
    & (dbscan_plots[db_labs[49]] | dbscan_plots[db_labs[50]])
    & (dbscan_plots[db_labs[51]] | dbscan_plots[db_labs[52]])
    & dbscan_plots[db_labs[53]]
)

In [79]:
# Announce the DBSCAN eps (eps_opts[6]) that the next chart grid was produced with.
print("Eps value %.4f" % (eps_opts[6],))

Eps value 0.0500


In [80]:
# DBSCAN charts for eps_opts[6] (entries 54-62 of db_labs), two per row.
(
    (dbscan_plots[db_labs[54]] | dbscan_plots[db_labs[55]])
    & (dbscan_plots[db_labs[56]] | dbscan_plots[db_labs[57]])
    & (dbscan_plots[db_labs[58]] | dbscan_plots[db_labs[59]])
    & (dbscan_plots[db_labs[60]] | dbscan_plots[db_labs[61]])
    & dbscan_plots[db_labs[62]]
)

## Plot the iteration for kmedoids

In [81]:
# Nine K-Medoids charts (one per origin), two per row; the ninth sits alone.
(
    (kmedoids_plots[kmed_labs[0]] | kmedoids_plots[kmed_labs[1]])
    & (kmedoids_plots[kmed_labs[2]] | kmedoids_plots[kmed_labs[3]])
    & (kmedoids_plots[kmed_labs[4]] | kmedoids_plots[kmed_labs[5]])
    & (kmedoids_plots[kmed_labs[6]] | kmedoids_plots[kmed_labs[7]])
    & kmedoids_plots[kmed_labs[8]]
)

# Mixed Pathways

## Set up the data

In [82]:
# Mixed ("moveintercept") test set: read the four CSV tables, then keep only the
# first two columns of the beta table (and the same columns of the SE / p-value tables).
npaths = 4
path = "../TestData_moveintercept/paths%d/"%npaths

beta_df, se_df, pval_df, traits_df = (
    pd.read_csv(path + csv_name, index_col=0)
    for csv_name in (
        "unstdBeta_df.csv",
        "unstdSE_df.csv",
        "pval_df.csv",
        "trait_info_nfil.csv",
    )
)
data_df = dict(beta=beta_df, se=se_df, pval=pval_df)

# The two columns that define the 2-D plane used in the rest of this section.
col1, col2 = beta_df.columns[0], beta_df.columns[1]
beta_crop, se_crop, pval_crop = (
    frame.loc[:, [col1, col2]] for frame in (beta_df, se_df, pval_df)
)

# Plain NumPy views of the cropped tables.
beta_mat, se_mat, pval_mat = (
    cropped.to_numpy() for cropped in (beta_crop, se_crop, pval_crop)
)
data_arr = dict(beta=beta_mat, se=se_mat, pval=pval_mat)

## Compute set of equally spaced origins

In [83]:
# Candidate origins lie on the line x = 0, at nchop evenly spaced heights
# between the smallest and largest value of the second column of beta_mat.
nchop = 10
ncols = beta_mat.shape[1]
niter = 3 * ncols
min_cols, max_cols = beta_mat.min(axis=0), beta_mat.max(axis=0)
x = 0.0
y = np.linspace(start=min_cols[1], stop=max_cols[1], num=nchop)

## Iterate through moving the origin

In [84]:
# Sweep the first nine candidate origins. For each one: re-centre the data,
# cluster its cosine-distance matrix with K-Medoids and DBSCAN, and store one
# chart per method keyed by the label of the column holding the cluster ids.
dbscan_plots = {}
kmedoids_plots = {}
db_labs = []
kmed_labs = []
nclust = npaths
eps = 0.001
for i in range(0, 9):
    new_origin = np.array([x, y[i]])
    new_data_set = translate_origin(new_origin, beta_crop)
    sims_new = cos_sim(new_data_set)
    results_new = new_data_set.copy()

    # K-Medoids on the precomputed distances.
    kmedoids = KMedoids(n_clusters=nclust, metric="precomputed", random_state=0).fit(sims_new["cos_dist"])
    kmed_lab = 'kmedoids%d_y%d'%(nclust,i)
    kmed_labs.append(kmed_lab)
    results_new[kmed_lab] = kmedoids.labels_

    # DBSCAN on the same distances.
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(sims_new["cos_dist"])
    dblab = 'dbscan_eps%.3d_y%d'%(1e+3*eps,i)
    results_new[dblab] = dbscan.labels_
    db_labs.append(dblab)

    tooltip1 = [dblab, kmed_lab]
    # Create a palette to ensure the junk and noise clusters are grey.
    clust_groups = pd.unique(results_new[tooltip1].values.ravel('K'))
    ncolours = len(clust_groups)
    palette = sns.color_palette(None, ncolours).as_hex()
    # Only recolour when a -1 label is actually present: without this guard an
    # origin for which DBSCAN reports no noise makes junk_ind[0] empty and the
    # [0][0] lookup raises IndexError.
    if -1 in clust_groups:
        junk_ind = np.where(clust_groups == -1)
        palette[junk_ind[0][0]] = '#808080'
    dbscan_new = chartclusters(results_new, dblab, dblab+":N", tooltip1, palette=palette, clustgroups=clust_groups)
    kmedoi_new = chartclusters(results_new, kmed_lab, kmed_lab+":N", tooltip1, palette=palette, clustgroups=clust_groups)
    dbscan_plots[dblab] = dbscan_new
    kmedoids_plots[kmed_lab] = kmedoi_new


[  0.         -35.11061218]
[  0.        -15.5538003]
[0.         4.00301159]
[ 0.         23.55982347]
[ 0.         43.11663536]
[ 0.         62.67344724]
[ 0.         82.23025913]
[  0.         101.78707101]
[  0.         121.34388289]


## Plot the iterations for DBSCAN

In [85]:
# Nine DBSCAN charts, two per row; the ninth chart sits alone on the last row.
(
    (dbscan_plots[db_labs[0]] | dbscan_plots[db_labs[1]])
    & (dbscan_plots[db_labs[2]] | dbscan_plots[db_labs[3]])
    & (dbscan_plots[db_labs[4]] | dbscan_plots[db_labs[5]])
    & (dbscan_plots[db_labs[6]] | dbscan_plots[db_labs[7]])
    & dbscan_plots[db_labs[8]]
)

## Plot the iteration for Kmedoids

In [86]:
# Nine K-Medoids charts, two per row; the ninth chart sits alone on the last row.
(
    (kmedoids_plots[kmed_labs[0]] | kmedoids_plots[kmed_labs[1]])
    & (kmedoids_plots[kmed_labs[2]] | kmedoids_plots[kmed_labs[3]])
    & (kmedoids_plots[kmed_labs[4]] | kmedoids_plots[kmed_labs[5]])
    & (kmedoids_plots[kmed_labs[6]] | kmedoids_plots[kmed_labs[7]])
    & kmedoids_plots[kmed_labs[8]]
)

# Using the MR-Clust data

In [87]:
# Load the MR-Clust example table and split it into effect estimates and
# standard errors, one column per trait.
path = "../MRClustData/"
data_df = pd.read_csv(path + "DBP_CAD.csv", index_col=0)

# bx/by hold the estimates and bxse/byse their standard errors.
beta_df = data_df[["bx", "by"]].copy()
se_df = data_df[["bxse", "byse"]].copy()

traits = ["DBP", "CAD"]
for frame in (beta_df, se_df):
    # Label rows by rsid and columns by trait.
    frame.index = data_df["rsid"]
    frame.columns = traits
    # Drop the first row (original note: it contains the rsid label only).
    # The drop is by index label, so any other row sharing that label goes too.
    frame.drop(index=frame.index[0], axis=0, inplace=True)

data_dic = dict(beta=beta_df, se=se_df)
col1, col2 = beta_df.columns[0], beta_df.columns[1]
beta_crop = beta_df.loc[:, [col1, col2]]


## Transform the data
Denote each data point by $p=(x,y)$.
If $x<0$, the point is reflected through the origin, $p \rightarrow p'=(-x,-y)$; points with $x \ge 0$ are left unchanged.

In [88]:
# Rows whose first-column value is negative are reflected through the origin
# so that every point ends up with a non-negative first coordinate.
neg_rows = beta_crop.loc[beta_crop[col1] < 0].index
for column in (col2, col1):
    beta_crop.loc[neg_rows, column] = -beta_crop.loc[neg_rows, column]

beta_mat = beta_crop.to_numpy()

## Set up equally spaced origins on the y-axis

In [89]:
# Candidate origins lie on the line x = 0, at nchop evenly spaced heights
# between the smallest and largest value of the second column of beta_mat.
nchop = 10
ncols = beta_mat.shape[1]
niter = 3 * ncols
min_cols, max_cols = beta_mat.min(axis=0), beta_mat.max(axis=0)
x = 0.0
y = np.linspace(start=min_cols[1], stop=max_cols[1], num=nchop)

In [90]:
# Sweep the first nine candidate origins. For each one: re-centre the data,
# cluster its cosine-distance matrix with K-Medoids and DBSCAN, and store one
# chart per method keyed by the label of the column holding the cluster ids.
dbscan_plots, kmedoids_plots = {}, {}
dblabs, kmed_labs = [], []
# NOTE(review): npaths is not set anywhere in this section; it is whatever an
# earlier section of the notebook left behind -- confirm that is intended.
nclust = npaths
eps = 0.001
for i in range(9):
    new_origin = np.array([x, y[i]])
    new_data_set = translate_origin(new_origin, beta_crop)
    results_new = new_data_set.copy()
    sims_new = cos_sim(new_data_set)

    # K-Medoids on the precomputed distances.
    kmed_lab = 'kmedoids%d_y%d'%(nclust,i)
    kmedoids = KMedoids(n_clusters=nclust, metric="precomputed", random_state=0)
    kmedoids.fit(sims_new["cos_dist"])
    results_new[kmed_lab] = kmedoids.labels_
    kmed_labs.append(kmed_lab)

    # DBSCAN on the same distances.
    dblab = 'dbscan_y%d'%i
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed")
    dbscan.fit(sims_new["cos_dist"])
    db_vals = dbscan.labels_
    results_new[dblab] = db_vals
    dblabs.append(dblab)

    # One palette shared by both charts; the label -1 (junk / noise) is forced to grey.
    tooltip1 = [dblab, kmed_lab]
    clust_groups = pd.unique(results_new[tooltip1].values.ravel('K'))
    ncolours = len(clust_groups)
    palette = sns.color_palette(None, ncolours).as_hex()
    junk_ind = np.where(clust_groups == -1)
    if junk_ind[0].size > 0:
        palette[junk_ind[0][0]] = '#808080'

    dbscan_new = chartclusters(
        results_new, "DBSCAN", dblab+":N", tooltip1,
        palette=palette, clustgroups=clust_groups,
    )
    kmedoi_new = chartclusters(
        results_new, "K-Medoids", kmed_lab+":N", tooltip1,
        palette=palette, clustgroups=clust_groups,
    )
    dbscan_plots[dblab] = dbscan_new
    kmedoids_plots[kmed_lab] = kmedoi_new



[ 0.     -0.0554]
[ 0.         -0.03734444]
[ 0.         -0.01928889]
[ 0.         -0.00123333]
[0.         0.01682222]
[0.         0.03487778]
[0.         0.05293333]
[0.         0.07098889]
[0.         0.08904444]


In [91]:
# Show the DBSCAN column labels in the order the loop generated them.
print(dblabs)

['dbscan_y0', 'dbscan_y1', 'dbscan_y2', 'dbscan_y3', 'dbscan_y4', 'dbscan_y5', 'dbscan_y6', 'dbscan_y7', 'dbscan_y8']


### DBSCAN plots

In [92]:
# Nine DBSCAN charts, two per row; the ninth chart sits alone on the last row.
(
    (dbscan_plots[dblabs[0]] | dbscan_plots[dblabs[1]])
    & (dbscan_plots[dblabs[2]] | dbscan_plots[dblabs[3]])
    & (dbscan_plots[dblabs[4]] | dbscan_plots[dblabs[5]])
    & (dbscan_plots[dblabs[6]] | dbscan_plots[dblabs[7]])
    & dbscan_plots[dblabs[8]]
)

## Kmedoids plots

In [93]:
# Nine K-Medoids charts, two per row; the ninth chart sits alone on the last row.
(
    (kmedoids_plots[kmed_labs[0]] | kmedoids_plots[kmed_labs[1]])
    & (kmedoids_plots[kmed_labs[2]] | kmedoids_plots[kmed_labs[3]])
    & (kmedoids_plots[kmed_labs[4]] | kmedoids_plots[kmed_labs[5]])
    & (kmedoids_plots[kmed_labs[6]] | kmedoids_plots[kmed_labs[7]])
    & kmedoids_plots[kmed_labs[8]]
)