# Download of Data Sets
In this notebook, we download the data sets required for the experiments presented in the accompanying article. The directory in which the data sets are stored is defined by the constant `evaluation.data_utils.DATA_PATH`.

In [None]:
import os.path

import numpy as np
import pandas as pd

from evaluation.data_utils import DATA_PATH

from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder

from torch.utils.data import DataLoader, ConcatDataset

from torchvision.transforms import ToTensor
from torchvision.datasets import CIFAR10

# Fixed seed intended to make randomized steps reproducible.
# NOTE(review): the constant is not referenced in the cells shown here —
# confirm that it is actually consumed elsewhere, or remove it.
RANDOM_STATE = 0

### Download OpenML Data Sets
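In the following, we download standard data sets from the [OpenML](https://www.openml.org/search?type=data) repository. Data sets without an OpenML data id (currently only CIFAR-10) are loaded through `torchvision` instead. Each data set is stored as a pair of NumPy files (`<name>-X.npy` and `<name>-y-true.npy`); data sets whose files already exist are skipped.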

In [None]:
# Registry of data sets to download:
#   name -> (OpenML data id or `None`, dtype of `X`, dtype of `y_true`).
# A data id of `None` marks a data set that is not fetched from OpenML but
# loaded through torchvision instead.
open_ml_data_sets = {
    "letter": (6, np.float32, np.int64),
    "cifar10": (None, np.float32, np.int64),
}

# Make sure the target directory exists before anything is saved into it.
os.makedirs(DATA_PATH, exist_ok=True)

for data_set_name, (data_id, X_type, y_true_type) in open_ml_data_sets.items():
    print(data_set_name)
    sample_path = f"{DATA_PATH}/{data_set_name}-X.npy"
    label_path = f"{DATA_PATH}/{data_set_name}-y-true.npy"

    # Skip data sets whose samples and labels have both been stored already.
    if os.path.isfile(sample_path) and os.path.isfile(label_path):
        continue

    # Download data.
    if data_id is not None:
        X, y_true = fetch_openml(data_id=data_id, return_X_y=True)
    elif data_set_name == "cifar10":
        # Merge the official train and test splits into a single data set.
        train_set = CIFAR10(root=DATA_PATH, train=True, download=True, transform=ToTensor())
        test_set = CIFAR10(root=DATA_PATH, train=False, download=True, transform=ToTensor())
        loader = DataLoader(ConcatDataset([train_set, test_set]), batch_size=256, shuffle=False, num_workers=1)

        # Collect whole batches and concatenate once instead of growing
        # Python lists sample by sample.
        X_batches, y_batches = [], []
        for x, y in loader:
            X_batches.append(x.numpy())
            y_batches.append(y.numpy())
        X = np.concatenate(X_batches)
        y_true = np.concatenate(y_batches)
    else:
        # Fail loudly rather than hitting undefined (or stale) variables from
        # a previous iteration or kernel session.
        raise ValueError(
            f"No download routine is defined for data set '{data_set_name}' without an OpenML data id."
        )

    # Preprocess `X`.
    if isinstance(X, pd.DataFrame):
        X = X.values
    X = X.astype(X_type)
    if data_set_name in ["cifar10"]:
        # Enforce the (n_samples, channels, height, width) layout.
        X = X.reshape(len(X), 3, 32, 32)

    # Preprocess `y_true`: map the class labels to consecutive integers and
    # cast them to the dtype configured for this data set.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    y_true = LabelEncoder().fit_transform(y_true)
    y_true = y_true.astype(y_true_type)

    # Save data.
    np.save(sample_path, X)
    np.save(label_path, y_true)