In [1]:
%load_ext lab_black
%load_ext autotime
import numpy as np
import pandas as pd

import drnb as nb

time: 575 ms (started: 2022-09-20 07:43:22 -07:00)


The Statlog (Shuttle) dataset is one of the datasets used by [T-SNE Is Not Optimized to Reveal Clusters in Data](https://arxiv.org/abs/2110.02573) and [Stochastic Cluster Embedding](https://arxiv.org/abs/2108.08003) (SCE). Those papers suggest that an embedding of this dataset should readily show obvious clusters, but that t-SNE fails to produce them. The other datasets they use are `cytometry`, `higgs`, `ijcnn` and `tomoradar`.

In [2]:
from io import BytesIO

import ncompress
import requests

# Download the training split. The file is served in compressed `.Z` form, so
# the raw bytes are fetched with `requests` and decompressed in memory with
# `ncompress` before being handed to pandas.
training_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z"
req = requests.get(
    training_url,
    timeout=10,
)
# Fail loudly on an HTTP error (404, 5xx, ...) rather than passing an error
# page to the decompressor, which would surface as a confusing decode failure.
req.raise_for_status()
# The file has no header row and its columns are separated by whitespace.
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True`.
training_data = pd.read_csv(
    BytesIO(ncompress.decompress(req.content)), header=None, sep=r"\s+"
)
training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50,21,77,0,28,0,27,48,22,2
1,55,0,92,0,0,26,36,92,56,4
2,53,0,82,0,52,-5,29,30,2,1
3,37,0,76,0,28,18,40,48,8,1
4,37,0,79,0,34,-26,43,46,2,1
...,...,...,...,...,...,...,...,...,...,...
43495,46,5,78,0,46,5,32,32,0,1
43496,37,0,79,-1,10,3,43,69,26,1
43497,48,0,78,3,46,0,30,32,2,1
43498,41,0,79,0,38,-25,38,40,2,1


time: 403 ms (started: 2022-09-20 07:43:23 -07:00)


In [3]:
# The test split is read directly from its URL by pandas. Like the training
# file, it has no header row and its columns are separated by whitespace.
test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst"
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True`.
test_data = pd.read_csv(test_url, header=None, sep=r"\s+")
test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,55,0,81,0,-6,11,25,88,64,4
1,56,0,96,0,52,-4,40,44,4,4
2,50,-1,89,-7,50,0,39,40,2,1
3,53,9,79,0,42,-2,25,37,12,4
4,55,2,82,0,54,-6,26,28,2,1
...,...,...,...,...,...,...,...,...,...,...
14495,80,0,84,0,-36,-29,4,120,116,5
14496,55,0,81,0,-20,25,26,102,76,4
14497,55,0,77,0,12,-22,22,65,42,4
14498,37,0,103,0,18,-16,66,85,20,1


time: 176 ms (started: 2022-09-20 07:43:23 -07:00)


In [4]:
# Stack the training rows on top of the test rows and renumber the index from
# zero, so the combined frame has one continuous row index.
data = pd.concat((training_data, test_data), axis=0, ignore_index=True)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,50,21,77,0,28,0,27,48,22,2
1,55,0,92,0,0,26,36,92,56,4
2,53,0,82,0,52,-5,29,30,2,1
3,37,0,76,0,28,18,40,48,8,1
4,37,0,79,0,34,-26,43,46,2,1
...,...,...,...,...,...,...,...,...,...,...
57995,80,0,84,0,-36,-29,4,120,116,5
57996,55,0,81,0,-20,25,26,102,76,4
57997,55,0,77,0,12,-22,22,65,42,4
57998,37,0,103,0,18,-16,66,85,20,1


time: 10.4 ms (started: 2022-09-20 07:43:23 -07:00)


In [5]:
# Human-readable class names, laid out so that an integer class code can be
# used directly as an index into the array. The empty string in slot 0 is a
# placeholder (the class codes visible in this notebook's outputs start at 1).
target_labels = np.array(
    (
        "",
        "Rad Flow",
        "Fpv Close",
        "Fpv Open",
        "High",
        "Bypass",
        "Bpv Close",
        "Bpv Open",
    )
)

time: 1.73 ms (started: 2022-09-20 07:43:24 -07:00)


In [6]:
from drnb.util import categorize

# The last column of the combined frame holds the integer class code.
class_ids = data.iloc[:, -1]
# Pair each code with its name by using the codes as indices into
# `target_labels`.
target = pd.DataFrame(
    {
        "label": class_ids,
        "description": target_labels[class_ids],
    }
)
# NOTE(review): `categorize` is a drnb helper called for its effect on `target`
# (its return value is unused); presumably it converts the "description"
# column to a categorical dtype -- confirm against drnb.util.
categorize(target, "description")
target

Unnamed: 0,label,description
0,2,Fpv Close
1,4,High
2,1,Rad Flow
3,1,Rad Flow
4,1,Rad Flow
...,...,...
57995,5,Bypass
57996,4,High
57997,4,High
57998,1,Rad Flow


time: 24.3 ms (started: 2022-09-20 07:43:24 -07:00)


In [7]:
# Keep only the feature columns now that the class code has been copied into
# `target`. The combined frame has nine feature columns followed by the label
# column, so slicing to a fixed width (rather than "everything but the last
# column") makes this cell safe to re-run: a second execution no longer drops
# a real feature.
n_features = 9
data = data.iloc[:, :n_features]
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,50,21,77,0,28,0,27,48,22
1,55,0,92,0,0,26,36,92,56
2,53,0,82,0,52,-5,29,30,2
3,37,0,76,0,28,18,40,48,8
4,37,0,79,0,34,-26,43,46,2
...,...,...,...,...,...,...,...,...,...
57995,80,0,84,0,-36,-29,4,120,116
57996,55,0,81,0,-20,25,26,102,76
57997,55,0,77,0,12,-22,22,65,42
57998,37,0,103,0,18,-16,66,85,20


time: 8.27 ms (started: 2022-09-20 07:43:24 -07:00)


In [8]:
from drnb.io.pipeline import create_default_pipeline

# Build the drnb default pipeline with its duplicate check switched on, then
# run it on the feature frame and the label frame under the name "shuttle".
# NOTE(review): what the pipeline computes or writes is defined in
# drnb.io.pipeline and is not visible here; the recorded cell time (~47 s)
# suggests it does substantial work -- see that module for details.
shuttle_pipeline = create_default_pipeline(check_for_duplicates=True)
data_result = shuttle_pipeline.run(
    "shuttle",
    data=data,
    target=target,
    tags=["lowdim"],
    verbose=True,
    url="https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)",
)

time: 46.6 s (started: 2022-09-20 07:43:24 -07:00)
