In [1]:
import pandas as pd

This notebook is based on the penguins example in [How to Use UMAP](https://umap-learn.readthedocs.io/en/latest/basic_usage.html).

In [2]:
# Palmer Penguins CSV, addressed by a commit hash of the palmerpenguins
# repository rather than a branch name.
PENGUINS_CSV_URL = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/c19a904462482430170bfe2c718775ddb7dbb885/inst/extdata/penguins.csv"

penguins_raw = pd.read_csv(PENGUINS_CSV_URL)
# Preview the first rows to check that the download parsed as expected.
penguins_raw.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [3]:
from drnb.io.pipeline import create_default_pipeline

# Options passed to the drnb default pipeline factory; anything not listed
# here is left at whatever create_default_pipeline chooses.
pipeline_options = {
    "check_for_duplicates": True,
    "scale": "z",  # presumably z-score standardization -- TODO confirm in drnb
}
data_pipe = create_default_pipeline(**pipeline_options)

In [4]:
# Numeric measurement columns used as the feature matrix.
feature_columns = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
# Descriptive columns kept as labels for each row.
label_columns = ["island", "sex", "species"]

data = penguins_raw[feature_columns]
# Build the label frame in a single expression so that re-running this cell
# always starts from penguins_raw; species is stored as a categorical.
target = penguins_raw.loc[:, label_columns].astype({"species": "category"})

In [5]:
# Descriptive metadata handed to the pipeline together with the dataset.
dataset_info = {
    "url": "https://allisonhorst.github.io/palmerpenguins/",
    "tags": ["small", "lowdim"],
}

# Run the pipeline on the features and labels under the name "penguins";
# verbose=True asks the pipeline to report what it is doing.
data_result = data_pipe.run(
    "penguins",
    data=data,
    target=target,
    **dataset_info,
    verbose=True,
)

In [6]:
# Show the pipeline result serialized as JSON (indent of 4) for inspection.
result_json = data_result.to_json(indent=4)
print(result_json)

{
    "pipeline": "DatasetPipeline(convert_args={'dtype': 'float32', 'layout': 'c'}, scale_action='z', check_for_duplicates=True, reduce=None, reduce_result=None, drnb_home=None, data_sub_dir='data', data_exporters=[FileExporter(drnb_home=None, sub_dir=None, suffix=None, create_sub_dir=True, verbose=False, file_type='npy'), FileExporter(drnb_home=None, sub_dir=None, suffix=None, create_sub_dir=True, verbose=False, file_type='csv')], target_exporters=[FileExporter(drnb_home=None, sub_dir=None, suffix=None, create_sub_dir=True, verbose=False, file_type='feather'), FileExporter(drnb_home=None, sub_dir=None, suffix=None, create_sub_dir=True, verbose=False, file_type='csv')], neighbors_request=NeighborsRequest(n_neighbors=[16, 51, 151], method='exact', metric=['euclidean'], file_types=['npy', 'csv'], params={}, verbose=True), triplets_request=TripletsRequest(n_triplets_per_point=5, seed=1337, file_types=['npy', 'csv'], metric=['euclidean']), verbose=True)",
    "started_on": "20250119044106