In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. t_sne) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from t_sne import *
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.manifold import TSNE as s_TSNE

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from tqdm import tqdm

import scanpy as sc 
import sklearn.metrics as sm
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

import pprint
import copy

## Data preparation

In [13]:
# Number of leading rows of the training CSV used in the experiments.
# Kept in one place so the feature matrix and the labels always stay aligned.
N_SAMPLES = 6000

X = pd.read_csv("data/mnist_train.csv")
# Column 0 is used as the label; every remaining column is used as a feature.
X_numpy = X.iloc[:N_SAMPLES, 1:].to_numpy().astype('float64')
labels = X.iloc[:N_SAMPLES, 0].values

In [14]:
# Sanity check: show the first five feature rows followed by the matrix shape.
print(X_numpy[:5], X_numpy.shape, sep="\n")

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(6000, 784)


In [15]:
# Display the label vector taken from the first CSV column (one entry per selected row).
labels

array([5, 0, 4, ..., 8, 6, 9], dtype=int64)

## Evaluation functions

In [16]:
def get_src(data, embedding):
    """Return the Spearman rank correlation between pairwise distances.

    Pairwise Euclidean distances are computed separately for `data` and for
    `embedding` (condensed form, as produced by `pdist`), and the rank
    correlation between the two distance vectors is returned. Both inputs
    must describe the same points in the same order.
    """
    # Condensed distance vectors for the original space and the embedding, in that order.
    dist_high, dist_low = (
        pdist(points, metric='euclidean') for points in (data, embedding)
    )
    rho, _p_value = spearmanr(dist_high, dist_low)
    return rho

def plot_embedding(embedding, labels):
    """Scatter-plot a 2-D embedding, one series per class, with the legend in a side panel.

    Parameters
    ----------
    embedding : array-like
        Point coordinates; column 0 is drawn on the x axis and column 1 on the y axis.
    labels : array-like
        One class label per point; points are grouped into series by unique label.

    The figure is shown with `plt.show()`; nothing is returned.
    """
    # Convert up front so boolean masking also works when plain lists are passed.
    embedding = np.asarray(embedding)
    labels = np.asarray(labels)
    classes = np.unique(labels)

    fig, (scatter_ax, legend_ax) = plt.subplots(1, 2, figsize=(16, 8))

    handles = []
    for label in classes:
        mask = labels == label
        handles.append(
            scatter_ax.scatter(embedding[mask, 0], embedding[mask, 1], s=1., label=label)
        )

    # Label the axes that hold the points. plt.xlabel / plt.ylabel would act on the
    # current axes, which after plt.subplots is the last one created (the legend panel).
    scatter_ax.set_xlabel("EMB1")
    scatter_ax.set_ylabel("EMB2")

    # The right-hand panel only hosts the legend: hide its empty frame and ticks, and
    # enlarge the legend markers, which would otherwise be drawn at the tiny s=1 size.
    legend_ax.axis("off")
    legend_ax.legend(handles, classes, ncol=2, markerscale=10)

    plt.show()

def get_clustering_metrics(embedding, labels):
    """Cluster an embedding with Leiden and score the clusters against reference labels.

    A neighbour graph is built directly on `embedding` (`use_rep="X"`), Leiden
    clustering is run on it, and the resulting assignment is compared with
    `labels` using four scores from `sklearn.metrics`.

    Parameters
    ----------
    embedding : array-like
        Embedded points, one row per sample.
    labels : array-like
        Reference class label for each sample, in the same order as `embedding`.

    Returns
    -------
    tuple
        `(ari, ami, homogeneity, completeness)`. The scores, together with
        their arithmetic mean, are also printed.
    """
    anndata = sc.AnnData(embedding)
    sc.pp.neighbors(anndata, use_rep="X")
    sc.tl.leiden(anndata)
    cluster_labels = anndata.obs["leiden"].values
    # Report the number of distinct clusters; the shape of the label vector is
    # only the number of samples.
    print("Number of Leiden clusters:", anndata.obs["leiden"].nunique())
    ari = sm.adjusted_rand_score(labels, cluster_labels)
    ami = sm.adjusted_mutual_info_score(labels, cluster_labels)
    homogeneity = sm.homogeneity_score(labels, cluster_labels)
    completeness = sm.completeness_score(labels, cluster_labels)
    # Parenthesise the sum: without it only `completeness` would be divided by 4.
    average = (ari + ami + homogeneity + completeness) / 4
    print("""
    ARI: {}\n
    AMI: {}\n
    Homogenuity: {}\n
    Completeness: {}\n
    Avarage: {}
    """.format(ari, ami, homogeneity, completeness, average))
    return ari, ami, homogeneity, completeness

## Experiments

### No optimization

In [18]:
# Baseline model for the "No optimization" experiment: compression period of 0 and
# random walk switched off (presumably this disables both features - confirm in t_sne).
baseline_config = dict(
    n_components=2,
    num_iters=1000,
    compression_period=0,
    random_walk=False,
)
tsne1 = TSNE(**baseline_config)

In [19]:
# Compute the high-dimensional similarities on the baseline model. In the recorded run
# the "Finding sigmas" stage took roughly 1.5 minutes for the 6000 input rows.
tsne1.get_highdimensional_similarities(X_numpy)

Finding sigmas


100%|██████████| 6000/6000 [01:38<00:00, 60.78it/s]


In [20]:
# Run the gradient-descent optimisation for the baseline model.
# NOTE(review): the recorded run was interrupted (KeyboardInterrupt) at iteration
# 161/1000, at about 4.7 s per iteration, so Y1 was never assigned in that session.
# Let this cell finish before running the evaluation cells that use Y1.
Y1 = tsne1.fit()

Running gradient descent...


  D_recip = np.reciprocal(1 + D)
 16%|█▌        | 161/1000 [12:36<1:05:43,  4.70s/it]


KeyboardInterrupt: 

In [None]:
# Plot the metrics stored on the baseline model (plot_metrics is defined in t_sne;
# presumably it draws the per-iteration values collected by fit() - confirm there).
tsne1.plot_metrics()

In [None]:
# Scatter-plot the baseline embedding. The result of tsne1.fit() is stored in Y1;
# no variable named Y is defined anywhere in this notebook.
plot_embedding(Y1, labels)

In [None]:
# Score the baseline embedding: Spearman correlation of pairwise distances (SRC) and
# the last KL-divergence value stored by the model. The baseline embedding is Y1;
# no variable named Y is defined anywhere in this notebook.
src = get_src(X_numpy, Y1)
kld = tsne1.metrics["kl_divergence"][-1]

print("KLD", kld)
print("SRC", src)

In [None]:
# Cluster the baseline embedding with Leiden and compare the clusters with the true
# labels; the four scores are printed and also returned as the cell's output.
get_clustering_metrics(Y1, labels)

### Early compression

In [None]:
# Model for the "Early compression" experiment: same settings as the baseline except
# for a compression period of 300 with a compression term of 1e-2 (presumably the
# term is applied during the first 300 iterations - confirm in t_sne).
early_compression_config = dict(
    n_components=2,
    num_iters=1000,
    compression_period=300,
    compression_term=1e-2,
    random_walk=False,
)
tsne2 = TSNE(**early_compression_config)

In [None]:
tsne1.get_highdimensional_similarities(X_numpy)

In [None]:
Y2 = tsne1.fit()

In [None]:
# Plot the metrics stored on the early-compression model (plot_metrics is defined in
# t_sne; presumably it draws the per-iteration values collected by fit() - confirm there).
tsne2.plot_metrics()

In [None]:
# Scatter-plot the early-compression embedding, one series per unique label.
plot_embedding(Y2, labels)

In [None]:
# Score the early-compression embedding: Spearman correlation of pairwise distances
# (SRC) and the last KL-divergence value stored by the model (KLD).
src = get_src(X_numpy, Y2)
kld = tsne2.metrics["kl_divergence"][-1]

for metric_name, metric_value in (("KLD", kld), ("SRC", src)):
    print(metric_name, metric_value)

In [None]:
# Evaluate the early-compression embedding (Y2). Passing Y1 here would only repeat
# the baseline scores from the previous section.
get_clustering_metrics(Y2, labels)