In [1]:
%load_ext lab_black
%load_ext autotime

time: 884 µs (started: 2023-04-05 22:47:47 -07:00)


The sphere-generation code below is taken from the [topological autoencoders GitHub repo](https://github.com/BorgwardtLab/topological-autoencoders) (see also [the paper](https://arxiv.org/abs/1906.00722)) and was used in the [UMATO](https://arxiv.org/abs/2205.00420) paper. I have made minor modifications to the code for formatting and to remove some options that were superfluous for generating a dataset like the one used by UMATO. These functions should be considered licensed under the same license as the original topological autoencoders (TA) code: [BSD 3-clause](https://github.com/BorgwardtLab/topological-autoencoders/blob/203e94a69c5f9cda049b9c3985b7c2b1e39ca922/LICENSE).

In [2]:
import numpy as np

# The defaults used below (n_samples=500, d=100, n_spheres=11, r=5) match the
# description of the Spheres dataset in the UMATO paper:
#
#   "For a synthetic dataset, we used Spheres [28]; it consists
#   of 11 101-dimensional spheres, where ten spheres with relatively
#   small radius of 5 and the number of points of 500 are enclosed by a
#   larger sphere with a radius of 25 and the number of 5,000. A total of
#   10,000 points form the data."


# https://github.com/BorgwardtLab/topological-autoencoders/blob/203e94a69c5f9cda049b9c3985b7c2b1e39ca922/src/datasets/topo_dataset/spheres.py
def create_sphere_dataset(n_samples=500, d=100, n_spheres=11, r=5, seed=42):
    """
    Generate small, randomly shifted spheres enclosed by one large sphere.

    `n_spheres - 1` spheres of radius `r` are sampled and each is moved by
    its own random offset. A final sphere of radius `5 * r` with
    `10 * n_samples` points is appended without any shift.

    Parameters
    -----------
    n_samples : int
        Number of points on each small sphere.
    d : int
        Dimension of the spheres; every point has `d + 1` coordinates.
    n_spheres : int
        Total number of spheres, counting the large enclosing one.
    r : float
        Radius of the small spheres.
    seed : int
        Passed to `np.random.seed`, i.e. NumPy's global random state is
        reset on every call.

    Returns
    --------
    dataset : ndarray of shape (n_points, d + 1)
        All spheres stacked row-wise, small spheres first, large sphere last.
    labels : ndarray of float of shape (n_points,)
        For every row of `dataset`, the index of the sphere it belongs to.
    """
    np.random.seed(seed)

    # This value is handed to np.random.normal as its `scale` argument.
    # It seemed that rescaling the shift by sqrt of d lets the big sphere
    # stay around the inner spheres.
    shift_scale = 10 / np.sqrt(d)
    # One offset row is drawn per sphere, although only the first
    # n_spheres - 1 rows are applied (the big sphere is not shifted).
    offsets = np.random.normal(0, shift_scale, [n_spheres, d + 1])

    # Small spheres, each translated by its own offset row.
    parts = [
        dsphere(n=n_samples, d=d, r=r) + offsets[k, :]
        for k in range(n_spheres - 1)
    ]

    # Additional big surrounding sphere: 5x the radius, 10x the points.
    parts.append(dsphere(n=10 * n_samples, d=d, r=r * 5))

    dataset = np.concatenate(parts, axis=0)

    # Each row is labelled with the position of its sphere in `parts`;
    # the labels are kept as floats.
    sizes = [part.shape[0] for part in parts]
    labels = np.repeat(np.arange(len(parts), dtype=float), sizes)

    return dataset, labels


# https://github.com/BorgwardtLab/topological-autoencoders/blob/203e94a69c5f9cda049b9c3985b7c2b1e39ca922/src/datasets/topo_dataset/custom_shapes.py
def dsphere(n=100, d=2, r=1, noise=None):
    """
    Sample `n` data points on a d-sphere of radius `r`.

    Standard normal draws with `d + 1` coordinates are rescaled so that
    every row has length `r`.

    Parameters
    -----------
    n : int
        Number of data points in shape.
    d : int
        Dimension of the sphere; the returned points have `d + 1` coordinates.
    r : float
        Radius of sphere.
    noise : float, optional
        If truthy, standard normal noise multiplied by this factor is added
        to every coordinate after the points were placed on the sphere.

    Returns
    --------
    ndarray of shape (n, d + 1)
        The sampled points.
    """
    gaussian = np.random.randn(n, d + 1)

    # Length of every row, kept as a column so it broadcasts over coordinates.
    row_norms = np.sqrt(np.square(gaussian).sum(axis=1, keepdims=True))

    # Normalize points to the sphere
    points = r * gaussian / row_norms

    if not noise:
        return points

    return points + noise * np.random.randn(*points.shape)

time: 112 ms (started: 2023-04-05 22:47:47 -07:00)


In [3]:
# Build the dataset with the function defaults; the default seed (42) is
# fixed, so re-running this cell regenerates the same points.
data, labels = create_sphere_dataset()

time: 37 ms (started: 2023-04-05 22:47:47 -07:00)


In [4]:
# Sanity check: 10 small spheres x 500 points + 5,000 points on the big
# sphere = 10,000 rows, each with d + 1 = 101 coordinates.
data.shape

(10000, 101)

time: 4.97 ms (started: 2023-04-05 22:47:47 -07:00)


In [5]:
# One label per row: the index of the sphere the row belongs to, stored as
# floats at this point.
labels

array([ 0.,  0.,  0., ..., 10., 10., 10.])

time: 27.4 ms (started: 2023-04-05 22:47:47 -07:00)


In [6]:
import pandas as pd

from drnb.util import categorize

# The labels are floats, so cast them to int before using them as the
# target column.
target = pd.DataFrame({"label": labels.astype(int)})
# NOTE(review): the return value is ignored, so `categorize` presumably
# modifies `target` in place (likely turning "label" into a categorical
# column) -- confirm against drnb.util.categorize.
categorize(target, "label")
target

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
...,...
9995,10
9996,10
9997,10
9998,10


time: 186 ms (started: 2023-04-05 22:47:47 -07:00)


In [7]:
from drnb.io.pipeline import create_default_pipeline

# Run drnb's default pipeline on the generated data under the name "spheres".
# NOTE(review): presumably this processes and stores `data` and `target`
# for later use -- confirm against drnb.io.pipeline.
# The result is bound to `_` so the cell does not display it.
_ = create_default_pipeline().run(
    "spheres",
    data=data,
    target=target,
    tags=["synthetic"],
    url="https://github.com/BorgwardtLab/topological-autoencoders",
    verbose=True,
)

time: 16.1 s (started: 2023-04-05 22:47:48 -07:00)
