In [2]:
import time
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import adjusted_rand_score
from sklearn.manifold import spectral_embedding
from sklearn.mixture import GaussianMixtureIC

from graspologic.embed import AdjacencySpectralEmbed, LaplacianSpectralEmbed

In [3]:
# In this notebook we compare scikit-learn's spectral_embedding to our AdjacencySpectralEmbed (ASE) and LaplacianSpectralEmbed (LSE).
# We provide four cases: LSE performs best in the first and third cases, and ASE performs best in the second and fourth.

In [47]:
def calc_ari(n_verts, n_sims, B, embed_method, labels_true):
    """Score one spectral-embedding method on a k-NN graph from a two-block model.

    A two-block probability matrix is built from ``B``, latent positions are
    taken from its rank-2 SVD, and a k-nearest-neighbour graph (k = sqrt(n))
    is built on those positions. The graph is then embedded ``n_sims`` times
    with the requested method, clustered into two components, and compared
    with ``labels_true`` via the adjusted Rand index.

    Parameters
    ----------
    n_verts : int
        Number of vertices; the first ``n_verts // 2`` belong to block 0 and
        the rest to block 1.
    n_sims : int
        Number of embed-and-cluster repetitions.
    B : array-like of shape (2, 2)
        Block probability matrix.
    embed_method : {"ase", "lse", "sklearn"}
        Embedding to apply to the k-NN graph.
    labels_true : sequence
        Ground-truth label for each vertex, passed to ``adjusted_rand_score``.

    Returns
    -------
    pandas.DataFrame
        One row per simulation with columns ``"test"`` (the method name) and
        ``"ari"`` (the score).

    Raises
    ------
    ValueError
        If ``embed_method`` is not one of the supported names.
    """
    valid_methods = ("ase", "lse", "sklearn")
    if embed_method not in valid_methods:
        # Fail fast with a clear message instead of hitting an unbound `Xhat`
        # after the graph has already been built.
        raise ValueError(
            f"embed_method must be one of {valid_methods}, got {embed_method!r}"
        )

    B = np.asarray(B)
    half = n_verts // 2

    # Everything up to the k-NN graph depends only on n_verts and B, so it is
    # built once here rather than once per simulation; any run-to-run
    # variation comes from the embedding and clustering steps in the loop.

    # make probability matrix from block matrix
    P = np.zeros((n_verts, n_verts))
    P[:half, :half] = B[0, 0]
    P[half:, half:] = B[1, 1]
    P[:half, half:] = B[0, 1]
    P[half:, :half] = B[1, 0]

    # make latent position matrix; note np.linalg.svd returns V already
    # transposed (Vh), so `Vh.T` below recovers V
    U, S, Vh = np.linalg.svd(P)
    scale = np.sqrt(np.diag(S[0:2]))

    # first half of the rows comes from U, second half from V
    X = U[:half, 0:2] @ scale
    Y = Vh.T[half:n_verts, 0:2] @ scale

    # concatenate the two matrices to get the full latent position matrix
    lat_mat = np.concatenate((X, Y), axis=0)

    # get k_neighbors graph from latent position matrix (k=sqrt(n))
    kn_graph = kneighbors_graph(lat_mat, n_neighbors=int(np.sqrt(n_verts)))
    kn_graph = kn_graph.toarray()

    rows = []
    for _ in tqdm(range(n_sims), desc="Simulations", position=1, leave=True):
        # Give each embedding its own copy so the shared graph cannot be
        # altered between simulations.
        graph = kn_graph.copy()

        if embed_method == "sklearn":
            Xhat = spectral_embedding(graph, n_components=2)
        else:
            embedder_cls = (
                AdjacencySpectralEmbed if embed_method == "ase"
                else LaplacianSpectralEmbed
            )
            # fit_transform is unpacked into two matrices here (the k-NN graph
            # is generally asymmetric); both are kept side by side as features.
            Xhat, Yhat = embedder_cls(n_components=2).fit_transform(graph)
            Xhat = np.concatenate((Xhat, Yhat), axis=1)

        # calculate the ARI score
        gm_ic = GaussianMixtureIC(min_components=2, max_components=2, covariance_type="all")
        labels_mclust = gm_ic.fit_predict(Xhat)
        ari = adjusted_rand_score(labels_true, labels_mclust)
        rows.append({"test": embed_method, "ari": ari})

    results = pd.DataFrame(rows)
    return results

In [48]:
# Case 1: B_affinity on a 100-vertex graph (LSE does best)
n_verts_lse = 100
n_sims = 50
B_affinity = np.array([[0.050, 0.013], [0.013, 0.051]])
labels_lse = [0] * (n_verts_lse // 2) + [1] * (n_verts_lse // 2)

# Run the three embeddings in the order lse -> ase -> sklearn
ari_aff_lse_df, ari_aff_ase_df, ari_aff_sklearn_df = [
    calc_ari(
        n_verts=n_verts_lse,
        n_sims=n_sims,
        B=B_affinity,
        embed_method=method,
        labels_true=labels_lse,
    )
    for method in ("lse", "ase", "sklearn")
]
ari_aff_df = pd.concat([ari_aff_lse_df, ari_aff_ase_df, ari_aff_sklearn_df])

# Mean ARI per embedding method across the simulations
ari_aff_means = ari_aff_df.groupby(["test"]).mean()
ari_aff_means

Simulations: 100%|██████████| 50/50 [00:04<00:00, 10.18it/s]
Simulations: 100%|██████████| 50/50 [00:03<00:00, 13.96it/s]
Simulations: 100%|██████████| 50/50 [00:03<00:00, 12.89it/s]


Unnamed: 0_level_0,ari
test,Unnamed: 1_level_1
ase,0.498978
lse,0.751601
sklearn,0.641565


In [52]:
# Case 2: B_core on a 500-vertex graph (ASE does best)
n_verts_ase = 500
n_sims = 50
B_core = np.array([[0.011, 0.027], [0.027, 0.079]])
labels_ase = [0] * (n_verts_ase // 2) + [1] * (n_verts_ase // 2)

# Run the three embeddings in the order ase -> lse -> sklearn
ari_core_ase_df, ari_core_lse_df, ari_core_sklearn_df = [
    calc_ari(
        n_verts=n_verts_ase,
        n_sims=n_sims,
        B=B_core,
        embed_method=method,
        labels_true=labels_ase,
    )
    for method in ("ase", "lse", "sklearn")
]
ari_core_df = pd.concat([ari_core_ase_df, ari_core_lse_df, ari_core_sklearn_df])

# Mean ARI per embedding method across the simulations
ari_core_means = ari_core_df.groupby(["test"]).mean()
ari_core_means

Simulations: 100%|██████████| 50/50 [00:09<00:00,  5.19it/s]
Simulations: 100%|██████████| 50/50 [00:09<00:00,  5.38it/s]
Simulations: 100%|██████████| 50/50 [00:08<00:00,  6.06it/s]


Unnamed: 0_level_0,ari
test,Unnamed: 1_level_1
ase,0.724238
lse,0.00203
sklearn,0.33942


In [51]:
# Case 3: B_core on a 100-vertex graph (LSE does best)
# NOTE(review): the ari_aff_* names below are the same ones the case 1 cell
# assigns, so running this cell overwrites those results even though B_core
# (not B_affinity) is used here. Consider distinct names once downstream
# usage has been checked.
n_verts_lse = 100
n_sims = 50
B_core = np.array([[0.011, 0.027], [0.027, 0.079]])
labels_lse = [0] * (n_verts_lse // 2) + [1] * (n_verts_lse // 2)

# Run the three embeddings in the order lse -> ase -> sklearn
ari_aff_lse_df, ari_aff_ase_df, ari_aff_sklearn_df = [
    calc_ari(
        n_verts=n_verts_lse,
        n_sims=n_sims,
        B=B_core,
        embed_method=method,
        labels_true=labels_lse,
    )
    for method in ("lse", "ase", "sklearn")
]
ari_aff_df = pd.concat([ari_aff_lse_df, ari_aff_ase_df, ari_aff_sklearn_df])

# Mean ARI per embedding method across the simulations
ari_aff_means = ari_aff_df.groupby(["test"]).mean()
ari_aff_means

Simulations: 100%|██████████| 50/50 [00:04<00:00, 10.68it/s]
Simulations: 100%|██████████| 50/50 [00:03<00:00, 13.11it/s]
Simulations: 100%|██████████| 50/50 [00:03<00:00, 13.04it/s]


Unnamed: 0_level_0,ari
test,Unnamed: 1_level_1
ase,0.576058
lse,0.789817
sklearn,0.214142


In [53]:
# Case 4: B_affinity on a 500-vertex graph (ASE does best)
# NOTE(review): the ari_core_* names below are the same ones the case 2 cell
# assigns, so running this cell overwrites those results even though
# B_affinity (not B_core) is used here. Consider distinct names once
# downstream usage has been checked.
n_verts_ase = 500
n_sims = 50
B_affinity = np.array([[0.050, 0.013], [0.013, 0.051]])
labels_ase = [0] * (n_verts_ase // 2) + [1] * (n_verts_ase // 2)

# Run the three embeddings in the order ase -> lse -> sklearn
ari_core_ase_df, ari_core_lse_df, ari_core_sklearn_df = [
    calc_ari(
        n_verts=n_verts_ase,
        n_sims=n_sims,
        B=B_affinity,
        embed_method=method,
        labels_true=labels_ase,
    )
    for method in ("ase", "lse", "sklearn")
]
ari_core_df = pd.concat([ari_core_ase_df, ari_core_lse_df, ari_core_sklearn_df])

# Mean ARI per embedding method across the simulations
ari_core_means = ari_core_df.groupby(["test"]).mean()
ari_core_means

Simulations: 100%|██████████| 50/50 [00:07<00:00,  6.43it/s]
Simulations: 100%|██████████| 50/50 [00:07<00:00,  6.80it/s]
Simulations: 100%|██████████| 50/50 [00:06<00:00,  7.60it/s]


Unnamed: 0_level_0,ari
test,Unnamed: 1_level_1
ase,0.495721
lse,0.003095
sklearn,0.247929
