In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
import pickle
import os

from sklearn.manifold import trustworthiness
from sklearn.metrics.pairwise import euclidean_distances

from scipy.stats import spearmanr

In [2]:
class Metrics():
    """Quality metrics comparing a low-dimensional embedding with its high-dimensional source.

    The high-dimensional distance and rank matrices are computed once at
    construction time; embeddings can then be swapped in with ``set_low_data``
    (or ``get_metrics(lowdata=...)``) without recomputing them.

    Parameters
    ----------
    highdata : array with one row per sample (``shape[0]`` is used as N).
    lowdata : optional embedding with one row per sample.
    K : neighbourhood size used by trustworthiness, continuity and
        neighbourhood hit.
    metric : distance metric name; only ``"euclidean"`` is accepted by the
        distance-matrix methods.
    """

    def __init__(self, highdata, lowdata=None, K=7, metric="euclidean"):
        self.K = K
        self.N = highdata.shape[0]
        self.high = highdata
        self.metric = metric
        # The embedding passed to the constructor must be stored before the
        # low-dimensional matrices are built from it further down.
        self.low = lowdata
        # Class labels are optional and only needed for the neighbourhood hit.
        self.labels = None

        self.high_distance_matrix = None
        self.low_distance_matrix = None

        self.high_rank_matrix = None
        self.low_rank_matrix = None

        # Cached results of the most recent call of each get_* method.
        self.trustworthiness = None
        self.continuity = None
        self.normalised_stress = None
        self.neighbourhood_hit = None
        self.shepard_goodness = None
        self.average_local_errors = None

        self.compute_high_distance_matrix()
        self.compute_high_rank_matrix()

        if lowdata is not None:
            self.compute_low_distance_matrix()
            self.compute_low_rank_matrix()

    def compute_high_distance_matrix(self):
        """ Computes distance matrix of high dimensional data """
        assert self.metric == "euclidean"
        self.high_distance_matrix = euclidean_distances(self.high)

    def compute_low_distance_matrix(self):
        """ Computes distance matrix of low dimensional data """
        assert self.metric == "euclidean"
        assert self.low is not None
        self.low_distance_matrix = euclidean_distances(self.low)

    def compute_high_rank_matrix(self):
        """ Computes the rank matrix of the high dimensional distances """
        self.high_rank_matrix = self._compute_rank_matrix(self.high_distance_matrix)

    def compute_low_rank_matrix(self):
        """ Computes the rank matrix of the low dimensional distances """
        self.low_rank_matrix = self._compute_rank_matrix(self.low_distance_matrix)

    def _compute_rank_matrix(self, matrix):
        """Return, for every row, the rank of each entry within that row.

        Entry ``[i, j]`` is the position of ``matrix[i, j]`` in the sorted row
        ``i`` (0 for the smallest value), so each row is a permutation of
        ``0..N-1``.
        """
        order = np.argsort(np.asarray(matrix), axis=1)
        # argsort of a permutation is its inverse: position -> rank.
        return np.argsort(order, axis=1)

    def _nearest_neighbours(self, rank_matrix):
        """Return an (N, K) array: row i holds the column indices ranked 1..K in row i.

        Rank 0 (normally the point itself, at distance zero) is skipped.

        Raises
        ------
        ValueError
            If K is not in ``1..N-1``, i.e. there are not K other points.
        """
        if not 0 < self.K < self.N:
            raise ValueError(
                f"K={self.K} must satisfy 0 < K < N (N={self.N}) to select neighbours"
            )
        # Each row of rank_matrix is a permutation, so sorting it yields the
        # column indices ordered by rank; columns 1..K are the K nearest.
        return np.argsort(rank_matrix, axis=1)[:, 1:self.K + 1]

    def set_low_data(self, low):
        """Replace the embedding and recompute its distance and rank matrices."""
        self.low = low
        self.compute_low_distance_matrix()
        self.compute_low_rank_matrix()

    def set_labels(self, labels):
        """Store per-sample class labels (any sequence; kept as a NumPy array)."""
        # A plain list cannot be indexed with an index array, so normalise here.
        self.labels = np.asarray(labels)

    def get_trustworthiness(self):
        """Compute, store and return scikit-learn's trustworthiness for K neighbours.

        The high-dimensional side is passed as the precomputed distance matrix;
        the low-dimensional side must be the embedding coordinates themselves,
        because scikit-learn runs its own nearest-neighbour search on them.
        scikit-learn may raise ValueError when K is too large relative to N.
        """
        assert self.high_distance_matrix is not None
        assert self.low is not None

        self.trustworthiness = trustworthiness(
            self.high_distance_matrix,
            self.low,
            n_neighbors=self.K,
            metric="precomputed",
        )
        return self.trustworthiness

    def get_continuity(self):
        """Compute, store and return the continuity for K neighbours.

        For every point, each of its K high-dimensional neighbours whose
        low-dimensional rank exceeds K contributes ``rank - K`` to a penalty;
        the result is ``1 - 2 / (N*K*(2N - 3K - 1)) * penalty``.
        """
        assert self.low_rank_matrix is not None
        assert self.high_rank_matrix is not None

        high_neighbours = self._nearest_neighbours(self.high_rank_matrix)

        # Low-dimensional rank of every high-dimensional neighbour, shape (N, K).
        rows = np.arange(self.N)[:, None]
        excess = self.low_rank_matrix[rows, high_neighbours] - self.K
        # Neighbours that stayed within the K nearest (excess <= 0) cost nothing.
        penalty = np.sum(np.maximum(excess, 0))

        self.continuity = 1 - (2 / (self.N*self.K*(2*self.N-3*self.K-1)))*penalty

        return self.continuity

    def get_normalised_stress(self):
        """Compute, store and return one minus the normalised stress.

        The value is ``1 - sum((d_high - d_low)^2) / sum(d_high^2)`` over all
        unordered pairs, so 1.0 means the distances are reproduced exactly and
        the value becomes negative when the squared error exceeds the squared
        high-dimensional distances.
        """
        assert self.high_distance_matrix is not None
        assert self.low_distance_matrix is not None

        # Strict upper triangle: every unordered pair exactly once.
        indices = np.triu_indices(self.N, k=1)
        high = self.high_distance_matrix[indices]
        low = self.low_distance_matrix[indices]

        self.normalised_stress = 1 - (
            np.sum((high - low) ** 2)
            /
            np.sum(high ** 2)
        )
        return self.normalised_stress

    def get_neighbourhood_hit(self):
        """Compute, store and return the neighbourhood hit for K neighbours.

        This is the fraction, over all points and their K low-dimensional
        neighbours, of neighbours that carry the same label as the point.
        Labels must have been provided with ``set_labels`` first.
        """
        assert self.low_rank_matrix is not None
        assert self.high_rank_matrix is not None
        assert self.labels is not None, "call set_labels() before get_neighbourhood_hit()"

        labels = np.asarray(self.labels)
        low_neighbours = self._nearest_neighbours(self.low_rank_matrix)

        # Compare each point's label with the labels of its K neighbours.
        hits = np.sum(labels[low_neighbours] == labels[:, None])

        self.neighbourhood_hit = hits / (self.N * self.K)
        return self.neighbourhood_hit

    def get_shepard_goodness(self):
        """Return the Spearman rank correlation between high and low pairwise distances.

        The full ``spearmanr`` result object is stored in
        ``self.shepard_goodness``; only the correlation is returned.
        """
        assert self.high_distance_matrix is not None
        assert self.low_distance_matrix is not None

        indices = np.triu_indices(self.N, k=1)

        self.shepard_goodness = spearmanr(
            self.high_distance_matrix[indices],
            self.low_distance_matrix[indices]
        )

        # The result unpacks as (correlation, pvalue); indexing avoids relying
        # on the attribute name, which differs between SciPy versions.
        return self.shepard_goodness[0]

    def get_shepard_goodnees(self):
        """Backward-compatible alias (original spelling) of ``get_shepard_goodness``."""
        return self.get_shepard_goodness()

    def get_average_local_error(self):
        """Compute, store and return the aggregated local error.

        Both distance sets are scaled by their maximum, the absolute
        differences are summed over all unordered pairs and the sum is
        multiplied by ``1 / (N - 1)``.
        """
        assert self.high_distance_matrix is not None
        assert self.low_distance_matrix is not None

        indices = np.triu_indices(self.N, k=1)
        max_high = np.max(self.high_distance_matrix[indices])
        max_low = np.max(self.low_distance_matrix[indices])

        # NOTE(review): the pairwise sum is divided by N-1, not by the number
        # of pairs - confirm this is the intended aggregate.
        self.average_local_errors = (1/(self.N-1)) * np.sum(np.abs(
            (1/max_high) * self.high_distance_matrix[indices]
            -
            (1/max_low) * self.low_distance_matrix[indices]
        ))

        return self.average_local_errors

    def get_metrics(self, lowdata=None, labels=None, mean_score=False):
        """Compute all metrics for the current (or the given) embedding.

        Parameters
        ----------
        lowdata : optional embedding; if given it replaces the current one.
        labels : optional class labels; the neighbourhood hit is only computed
            when labels are passed to this call.
        mean_score : if True, return the unweighted mean of the metrics
            instead of the individual values.

        Returns
        -------
        list or float
            ``[trustworthiness, continuity, normalised_stress,
            neighbourhood_hit, shepard_goodness]`` when labels are given,
            the same list without ``neighbourhood_hit`` otherwise, or the mean
            of that list when ``mean_score`` is True.
        """
        if lowdata is not None:
            self.set_low_data(lowdata)

        if labels is not None:
            self.set_labels(labels)

        tw = self.get_trustworthiness()
        ct = self.get_continuity()
        ns = self.get_normalised_stress()

        if labels is not None:
            nh = self.get_neighbourhood_hit()

        sg = self.get_shepard_goodness()

        if mean_score:
            if labels is not None:
                return 0.2 * (tw + ct + ns + nh + sg)
            return 0.25 * (tw + ct + ns + sg)

        if labels is not None:
            return [tw, ct, ns, nh, sg]
        return [tw, ct, ns, sg]

In [None]:
def evaluate_experiments(ds_name, experiment_name, extra_name="", has_labels=False, verbose=0):
    """Compute embedding-quality metrics for every data set / hyper-parameter pair of one experiment.

    Parameters
    ----------
    ds_name : name of the data-set folder under ``synth_data_gen/synth_datasets``
        and ``synth_data_gen/experiments``.
    experiment_name : name of the experiment (CSV of hyper-parameters and
        folder of pickled embeddings).
    extra_name : suffix appended to the ``experiments`` folder when reading the
        hyper-parameter CSV.
        NOTE(review): the embeddings are always read from ``experiments/``
        without this suffix - confirm that is intended.
    has_labels : if True, class labels are read from the data-set pickle
        (key ``"labels"`` or ``"classes"``) and the neighbourhood hit is
        computed as well.
    verbose : 0 is quiet, >0 prints one line per data set, >1 also prints one
        line per hyper-parameter row and the reason a row was skipped.

    Returns
    -------
    (metrics, params)
        ``metrics`` maps each data-set index of ``params`` to a DataFrame
        holding the hyper-parameter columns followed by one column per metric;
        rows whose metrics could not be computed contain NaN. Data sets whose
        pickle cannot be loaded are absent from the dict. ``params`` is the
        data set's ``params.csv`` table.

    Raises
    ------
    Exception
        If ``has_labels`` is True and a data set has neither a ``"labels"``
        nor a ``"classes"`` entry.
    """
    HP = pd.read_csv(f"synth_data_gen/experiments{extra_name}/{experiment_name}.csv")
    params = pd.read_csv(f"synth_data_gen/synth_datasets/{ds_name}/params.csv")

    if has_labels:
        metric_names = ["trustworthiness", "continuity", "normalised_stress", "neighbourhood_hit", "shepard_goodness"]
    else:
        metric_names = ["trustworthiness", "continuity", "normalised_stress", "shepard_goodness"]
    # Placeholder row that keeps a failed hyper-parameter setting aligned with HP.
    missing_row = [float("nan")] * len(metric_names)

    metrics = dict()

    for idx in params.index:
        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        try:
            with open(f'synth_data_gen/synth_datasets/{ds_name}/set_{idx}.pickle', 'rb') as handle:
                high = pickle.load(handle)
        except Exception as e:
            print(f"Failed to load high dimensional data, idx={idx}")
            print(e)
            continue

        # Embeddings of this data set only; released when the next one starts.
        low_data = dict()
        for i in HP.index:
            try:
                with open(f'synth_data_gen/experiments/{ds_name}/{experiment_name}/set_{idx}_HP_{i}', 'rb') as handle:
                    low_data[i] = pickle.load(handle)
            except Exception as e:
                print(f"Failed to load low dimensional data, idx={idx}, i={i}")
                print(e)
                continue

        shape_metric = Metrics(high["data"])
        if has_labels:
            if "labels" in high:
                labels = high["labels"]
                if isinstance(labels, list):
                    labels = np.array(labels, dtype=int)
            elif "classes" in high:
                labels = high["classes"]
                if isinstance(labels, list):
                    # Lists cannot be indexed with an index array.
                    labels = np.asarray(labels)
            else:
                raise Exception("Found no label data named 'labels' or 'classes'")
        else:
            labels = None

        if verbose > 0:
            print(f"Computing metrics for set {idx}")

        rows = []
        for i in HP.index:
            if verbose > 1:
                print(f"\t HPs number {i}")
            try:
                rows.append(
                    shape_metric.get_metrics(lowdata=low_data[i]["data"], labels=labels)
                )
            except Exception as e:
                # Best effort: a missing or broken embedding must not abort the
                # sweep, but its row has to stay in place so that later metrics
                # are not attached to the wrong hyper-parameters.
                if verbose > 1:
                    print(f"\t\t Exception ({e}); passing these HPs")
                rows.append(missing_row)

        metrics_df = pd.DataFrame(rows, index=HP.index, columns=metric_names)
        metrics[idx] = pd.concat([HP, metrics_df], axis=1)

    return metrics, params

# Cylinder

In [61]:
# Evaluate every sweep that was run on the cylinder data set (no class labels).
ds_name = "cylinder"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
cylinder_metrics = {}
cylinder_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    cylinder_metrics[en], cylinder_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

Metrics for experiment neighbor_init_sweep



KeyboardInterrupt



# Gaussian clusters plane

In [29]:
# Evaluate every sweep run on the Gaussian-clusters-plane data set, with labels
# (so the neighbourhood hit is included) and per-row progress output.
ds_name = "gaussian_clusters_plane"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
gcp_metrics = {}
gcp_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    gcp_metrics[en], gcp_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=True,
        verbose=2,
    )

Metrics for experiment neighbor_init_sweep


NameError: name 'evaluate_experiments' is not defined

In [31]:
# Reload the parameter table and the stored metrics of the Gaussian-clusters-plane
# data set. The code that writes metrics.pickle is not part of this notebook view.
ds_name = "gaussian_clusters_plane"
gcp_params = pd.read_csv(f"synth_data_gen/synth_datasets/{ds_name}/params.csv")

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'synth_data_gen/experiments/{ds_name}/metrics.pickle', 'rb') as handle:
    gcp_metrics = pickle.load(handle)

In [32]:
# Display the params.csv table of the Gaussian-clusters-plane data set.
gcp_params

Unnamed: 0,cluster_size,final_dim,noise_points
0,30,4,0
1,30,4,100
2,30,4,400
3,30,8,0
4,30,8,100
5,30,8,400
6,30,16,0
7,30,16,100
8,30,16,400
9,30,32,0


In [35]:
# Metrics table stored under key 47 of the bhtsne perplexity sweep
# (presumably the data-set index, as produced by evaluate_experiments - confirm).
gcp_metrics["bhtsne_perplexity_sweep"][47]

Unnamed: 0,perplexity,trustworthiness,continuity,normalised_stress,neighbourhood_hit,shepard_goodness
0,15,0.876265,0.960992,0.422667,0.851173,0.4425
1,30,0.895105,0.965081,0.678311,0.882398,0.438938
2,60,0.921775,0.967285,0.725646,0.921327,0.467556
3,100,0.938294,0.96746,0.721495,0.946735,0.473238


In [36]:
# Metrics table stored under key 47 of the "hyp1" experiment
# (presumably the data-set index, as produced by evaluate_experiments - confirm).
gcp_metrics["hyp1"][47]

Unnamed: 0,n_neighbors,min_dist,spread,learning_rate,trustworthiness,continuity,normalised_stress,neighbourhood_hit,shepard_goodness
0,223,0.301783,0.601329,0.272597,0.85987,0.957957,0.42602,0.842092,0.442677
1,141,3.819508,7.873089,0.417392,0.869995,0.961016,-0.566333,0.858418,0.489201
2,123,1.241533,1.494626,0.574509,0.864138,0.960074,0.636358,0.844337,0.473072
3,246,0.108908,4.347207,0.234977,0.872006,0.961272,0.640153,0.853112,0.535605
4,61,0.210397,3.492269,0.015676,0.877506,0.959853,0.583938,0.872296,0.432139
5,99,0.055639,9.646996,0.039412,0.89647,0.948607,0.701986,0.906735,0.40537
6,200,0.156191,0.829579,0.068883,0.864808,0.959952,0.533022,0.84199,0.491917
7,245,0.217772,2.61849,0.121135,0.868107,0.961327,0.748985,0.849694,0.554693
8,82,0.071311,4.949885,0.028797,0.881493,0.960083,0.68384,0.875816,0.439279
9,299,0.517683,0.989315,0.844336,,,,,


In [17]:
# Same lookup as the preceding cell; the execution counts show it was run out of order.
gcp_metrics["hyp1"][47]

Unnamed: 0,n_neighbors,min_dist,spread,learning_rate,trustworthiness,continuity,normalised_stress,neighbourhood_hit,shepard_goodness
0,223,0.301783,0.601329,0.272597,0.85987,0.957957,0.42602,0.842092,0.442677
1,141,3.819508,7.873089,0.417392,0.869995,0.961016,-0.566333,0.858418,0.489201
2,123,1.241533,1.494626,0.574509,0.864138,0.960074,0.636358,0.844337,0.473072
3,246,0.108908,4.347207,0.234977,0.872006,0.961272,0.640153,0.853112,0.535605
4,61,0.210397,3.492269,0.015676,0.877506,0.959853,0.583938,0.872296,0.432139
5,99,0.055639,9.646996,0.039412,0.89647,0.948607,0.701986,0.906735,0.40537
6,200,0.156191,0.829579,0.068883,0.864808,0.959952,0.533022,0.84199,0.491917
7,245,0.217772,2.61849,0.121135,0.868107,0.961327,0.748985,0.849694,0.554693
8,82,0.071311,4.949885,0.028797,0.881493,0.960083,0.68384,0.875816,0.439279
9,299,0.517683,0.989315,0.844336,,,,,


In [15]:
# Metrics table stored under key 47 of the fitsne perplexity sweep
# (presumably the data-set index, as produced by evaluate_experiments - confirm).
gcp_metrics["fitsne_perplexity_sweep"][47]

Unnamed: 0,perplexity,trustworthiness,continuity,normalised_stress,neighbourhood_hit,shepard_goodness
0,15,0.869469,0.959278,-0.214229,0.845255,0.430725
1,30,0.890139,0.963688,0.410232,0.874745,0.464739
2,60,0.921826,0.967083,0.697818,0.920561,0.48387
3,100,0.939105,0.967397,0.716335,0.947806,0.484634


# Half Cylinder

In [None]:
# Evaluate every sweep that was run on the half-cylinder data set (no class labels).
ds_name = "half_cylinder"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
half_cylinder_metrics = {}
half_cylinder_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    half_cylinder_metrics[en], half_cylinder_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

# Hilbert

In [None]:
# Evaluate the 2-D and 1-D sweeps plus "hyp1" on the Hilbert data set (no class labels).
ds_name = "hilbert"
experiment_names = [
    "neighbor_init_sweep_2d", "mindist_spread_sweep_2d",
    "fitsne_perplexity_sweep_2d", "bhtsne_perplexity_sweep_2d",
    "neighbor_init_sweep_1d", "mindist_spread_sweep_1d",
    "fitsne_perplexity_sweep_1d", "bhtsne_perplexity_sweep_1d",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
hilbert_metrics = {}
hilbert_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    hilbert_metrics[en], hilbert_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

In [18]:
# Reload the parameter table and the stored metrics of the Hilbert data set.
# The code that writes metrics.pickle is not part of this notebook view.
ds_name = "hilbert"
hilbert_params = pd.read_csv(f"synth_data_gen/synth_datasets/{ds_name}/params.csv")

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'synth_data_gen/experiments/{ds_name}/metrics.pickle', 'rb') as handle:
    hilbert_metrics = pickle.load(handle)

In [23]:
# Display the params.csv table of the Hilbert data set.
hilbert_params

Unnamed: 0,n,p,inbetween,final_dim
0,3,2,5,4
1,3,2,5,8
2,3,2,5,16
3,3,2,5,32
4,3,2,15,4
5,3,2,15,8
6,3,2,15,16
7,3,2,15,32
8,3,2,30,4
9,3,2,30,8


In [26]:
# Metrics table stored under key 11 of the "hyp1" experiment
# (presumably the data-set index, as produced by evaluate_experiments - confirm).
hilbert_metrics["hyp1"][11]

Unnamed: 0,n_neighbors,min_dist,spread,learning_rate,trustworthiness,continuity,normalised_stress,shepard_goodness
0,223,0.301783,0.601329,0.272597,0.99875,0.998396,-16.472024,0.545213
1,141,3.819508,7.873089,0.417392,0.998716,0.998473,-331.973689,0.573515
2,123,1.241533,1.494626,0.574509,0.998512,0.998286,-45.620158,0.481678
3,246,0.108908,4.347207,0.234977,0.998922,0.998511,-81.063711,0.57013
4,61,0.210397,3.492269,0.015676,0.99875,0.99826,-11.001235,0.624986
5,99,0.055639,9.646996,0.039412,0.997199,0.997868,-50.636631,0.630353
6,200,0.156191,0.829579,0.068883,0.998926,0.99849,-5.412106,0.582852
7,245,0.217772,2.61849,0.121135,0.998934,0.998531,-33.038102,0.556458
8,82,0.071311,4.949885,0.028797,0.99889,0.998467,-22.583451,0.623214
9,299,0.517683,0.989315,0.844336,,,,


In [25]:
# Metrics table stored under key 11 of the fitsne perplexity sweep
# (presumably the data-set index, as produced by evaluate_experiments - confirm).
hilbert_metrics["fitsne_perplexity_sweep"][11]

Unnamed: 0,perplexity,trustworthiness,continuity,normalised_stress,shepard_goodness
0,15,0.99877,0.998697,-284.345111,0.62807
1,30,0.999005,0.998761,-198.570436,0.605771
2,60,0.999129,0.998745,-128.409375,0.671702
3,100,0.999151,0.99878,-80.538349,0.681362


# MNIST

In [None]:
# Evaluate every sweep that was run on MNIST, with labels so the
# neighbourhood hit is included.
ds_name = "mnist"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
mnist_metrics = {}
mnist_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    mnist_metrics[en], mnist_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=True,
    )

# Punto silla (saddle point)

In [None]:
# Evaluate every sweep that was run on the "punto_silla" data set (no class labels).
ds_name = "punto_silla"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
punto_silla_metrics = {}
punto_silla_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    punto_silla_metrics[en], punto_silla_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

# Shapes

In [81]:
# Evaluate every sweep that was run on the shapes data set (no class labels),
# printing progress for each data set and hyper-parameter row.
ds_name = "shapes"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
shapes_metrics = {}
shapes_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    shapes_metrics[en], shapes_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
        verbose=2,
    )

Metrics for experiment fitsne_perplexity_sweep
Computing metrics for set 0
	 HPs number 0
	 HPs number 1
	 HPs number 2
	 HPs number 3
Computing metrics for set 1
	 HPs number 0
	 HPs number 1
	 HPs number 2
	 HPs number 3
Computing metrics for set 2
	 HPs number 0
	 HPs number 1
	 HPs number 2
	 HPs number 3
Computing metrics for set 3
	 HPs number 0
	 HPs number 1
	 HPs number 2
	 HPs number 3


In [27]:
# Reload the parameter table and the stored metrics of the shapes data set.
# The code that writes metrics.pickle is not part of this notebook view.
ds_name = "shapes"
shapes_params = pd.read_csv(f"synth_data_gen/synth_datasets/{ds_name}/params.csv")

# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(f'synth_data_gen/experiments/{ds_name}/metrics.pickle', 'rb') as handle:
    shapes_metrics = pickle.load(handle)

In [28]:
# Display the params.csv table of the shapes data set.
shapes_params

Unnamed: 0,dims
0,4
1,8
2,16
3,32


# Shapes noise

In [None]:
# Evaluate every sweep that was run on the noisy shapes data set, with labels
# so the neighbourhood hit is included.
ds_name = "shapes_noise"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
shapes_noise_metrics = {}
shapes_noise_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    shapes_noise_metrics[en], shapes_noise_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=True,
    )

# Sphere

In [None]:
# Evaluate every sweep that was run on the sphere data set (no class labels).
ds_name = "sphere"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
sphere_metrics = {}
sphere_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    sphere_metrics[en], sphere_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

# Sphere uniform

In [None]:
# Evaluate every sweep that was run on the uniform-sphere data set (no class labels).
ds_name = "sphere_unif"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
sphere_unif_metrics = {}
sphere_unif_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    sphere_unif_metrics[en], sphere_unif_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

# Swiss roll

In [None]:
# Evaluate every sweep that was run on the swiss-roll data set (no class labels).
ds_name = "swiss_roll"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "fitsne_perplexity_sweep", "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
swiss_roll_metrics = {}
swiss_roll_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    swiss_roll_metrics[en], swiss_roll_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

# Torus

In [None]:
# Evaluate every sweep that was run on the torus data set (no class labels).
ds_name = "torus"
experiment_names = [
    "neighbor_init_sweep", "mindist_spread_sweep",
    "bhtsne_perplexity_sweep",
    "hyp1",
]

# One entry per experiment; the params table is re-assigned on every pass.
torus_metrics = {}
torus_params = None
for en in experiment_names:
    print(f"Metrics for experiment {en}")
    torus_metrics[en], torus_params = evaluate_experiments(
        ds_name=ds_name,
        experiment_name=en,
        has_labels=False,
    )

Metrics for experiment bhtsne_perplexity_sweep
Computing metrics for set 0
	 HPs number 0
	 HPs number 1
	 HPs number 2
	 HPs number 3
Computing metrics for set 1
	 HPs number 0
	 HPs number 1
	 HPs number 2
	 HPs number 3
Computing metrics for set 2
	 HPs number 0
	 HPs number 1
	 HPs number 2



KeyboardInterrupt

