In [1]:
import pandas as pd

`s1k` is a synthetic dataset I created: it's a fuzzy 9D simplex, consisting of 10 Gaussian clusters
with centers equidistant from each other. It is designed to illustrate the problem of the "crowding
effect" for dimensionality reduction: there is definitely no way to faithfully represent the 
distances in 2D. It would be easy to write code to generate this on demand, but I have forgotten
exactly how I generated this particular copy, and because this is a small dataset (`shape` is
`(1000, 9)`) I have just added it to this repo.

In [2]:
from pathlib import Path

# The data files sit in the "data" directory two levels above the current working directory.
data_path = Path.cwd().parent.parent.joinpath("data")
s1k_data_path = data_path.joinpath("s1k_data.csv.gz")
s1k_target_path = data_path.joinpath("s1k_target.csv.gz")

In [3]:
import gzip

# Read the gzipped label table and store the "label" column as a categorical.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests the CSV
# carries a saved index that is being read as data -- confirm whether that is intended.
with gzip.open(s1k_target_path, "rb") as target_file:
    s1k_target = pd.read_csv(target_file).astype({"label": "category"})
s1k_target

Unnamed: 0,label
0,0
1,1
2,2
3,3
4,4
...,...
995,9
996,9
997,9
998,9


In [4]:
# The data CSV is read with header=None (no header row) and converted straight to a
# NumPy array.
with gzip.open(s1k_data_path, "rb") as data_file:
    s1k_data = pd.read_csv(data_file, header=None).to_numpy()

In [5]:
# Preview the first five rows together with the overall array shape.
(s1k_data[:5], s1k_data.shape)

(array([[ 1.34164079,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [-0.1490712 ,  1.33333333,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [-0.1490712 , -0.16666667,  1.32287566,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [-0.1490712 , -0.16666667, -0.18898224,  1.30930734,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [-0.1490712 , -0.16666667, -0.18898224, -0.21821789,  1.29099445,
          0.        ,  0.        ,  0.        ,  0.        ]]),
 (1000, 9))

In [6]:
from drnb.io.pipeline import create_default_pipeline

# Build the default pipeline with check_for_duplicates=True, then run it on the s1k
# array and its label table.
# NOTE(review): "s1k" is presumably the name the dataset is registered under -- confirm
# against the drnb pipeline API.
s1k_pipeline = create_default_pipeline(check_for_duplicates=True)
data_pipe = s1k_pipeline.run(
    "s1k",
    data=s1k_data,
    target=s1k_target,
    tags=["small", "lowdim"],
    verbose=True,
)