# Download/Creation of Data Sets
In this notebook, we download/create the data sets required for the experiments presented in the accompanying article. The data sets are stored in the directory given by the constant `evaluation.data_utils.DATA_PATH`.

In [None]:
import os.path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from evaluation.data_utils import DATA_PATH

from skactiveml.utils import ExtLabelEncoder

from sklearn.datasets import make_blobs, fetch_openml
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

from torch.utils.data import DataLoader, ConcatDataset

from torchvision.transforms import ToTensor
from torchvision.datasets import CIFAR10, SVHN, FashionMNIST

# Seed passed to the random data generation in this notebook (e.g., `make_blobs`) to ensure reproducibility.
RANDOM_STATE = 0

### Create Toy Data Sets
The following data set is generated for illustration purposes. It is a two-dimensional binary classification problem.

In [None]:
# Generate a two-dimensional toy data set for binary classification.
data_set_name = "toy-classification"

# `np.save` does not create missing directories, so make sure the target directory exists.
os.makedirs(DATA_PATH, exist_ok=True)

# Draw four blobs and fold their cluster indices into two classes (cluster index modulo 2).
X, y_true = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=RANDOM_STATE)
y_true = y_true % 2

# Store samples and true class labels.
np.save(f"{DATA_PATH}/{data_set_name}-X", X.astype(np.float32))
np.save(f"{DATA_PATH}/{data_set_name}-y-true", y_true.astype(np.int64))

# Visualize the samples colored by their class label.
plt.scatter(X[:, 0], X[:, 1], c=y_true)
plt.show()

### Download OpenML Data Sets
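In the following, we download the `letter` data set from the [OpenML](https://www.openml.org/search?type=data) repository and the image data sets Fashion-MNIST, CIFAR-10, and SVHN via `torchvision`. Data sets whose files already exist in `DATA_PATH` are skipped.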

In [None]:
# Data sets to create: name -> (OpenML data id or `None`, dtype of `X`, dtype of `y_true`).
# Entries without an OpenML data id are fetched via `torchvision` instead.
open_ml_data_sets = {
    "letter": (6, np.float32, np.int64),
    "fmnist": (None, np.float32, np.int64),
    "emnist": (None, np.float32, np.int64),
    "cifar10": (None, np.float32, np.int64),
    "svhn": (None, np.float32, np.int64),
}
# `torchvision` sources: name -> (data set class, keyword arguments of train split, keyword arguments of test split).
torchvision_data_sets = {
    "fmnist": (FashionMNIST, {"train": True}, {"train": False}),
    "cifar10": (CIFAR10, {"train": True}, {"train": False}),
    "svhn": (SVHN, {"split": "train"}, {"split": "test"}),
}
for data_set_name, (data_id, X_type, y_true_type) in open_ml_data_sets.items():
    print(data_set_name)
    sample_path = f"{DATA_PATH}/{data_set_name}-X.npy"
    label_path = f"{DATA_PATH}/{data_set_name}-y-true.npy"

    # Skip data sets whose arrays have already been created.
    if os.path.isfile(sample_path) and os.path.isfile(label_path):
        continue

    # Download data.
    if data_id is not None:
        X, y_true = fetch_openml(data_id=data_id, return_X_y=True)
    elif data_set_name in torchvision_data_sets:
        data_set_class, train_kwargs, test_kwargs = torchvision_data_sets[data_set_name]
        train_set = data_set_class(root=DATA_PATH, download=True, transform=ToTensor(), **train_kwargs)
        test_set = data_set_class(root=DATA_PATH, download=True, transform=ToTensor(), **test_kwargs)

        # Merge train and test split into one data set and collect it batch-wise in its original order.
        loader = DataLoader(ConcatDataset([train_set, test_set]), batch_size=256, shuffle=False, num_workers=1)
        X_batches = []
        y_true_batches = []
        for x, y in loader:
            X_batches.append(x.numpy())
            y_true_batches.append(y.numpy())
        X = np.concatenate(X_batches)
        y_true = np.concatenate(y_true_batches)
    else:
        # Without this guard, the `train_set`/`test_set` of the previous iteration would be reused and
        # stored under the wrong data set name.
        print(f"No download source is defined for '{data_set_name}'; skipping.")
        continue

    # Preprocess `X`.
    if isinstance(X, pd.DataFrame):
        X = X.values
    X = X.astype(X_type)
    if data_set_name in ["fmnist", "emnist"]:
        # Single-channel images of size 28x28.
        X = X.reshape(len(X), 1, 28, 28)
    elif data_set_name in ["cifar10", "svhn"]:
        # Three-channel images of size 32x32.
        X = X.reshape(len(X), 3, 32, 32)

    # Preprocess `y_true`: encode the class labels as consecutive integers.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    y_true = LabelEncoder().fit_transform(y_true)
    y_true = y_true.astype(y_true_type)

    # Save data.
    np.save(sample_path, X)
    np.save(label_path, y_true)

### Label Me Data Set
Download the archive [Label Me](http://fprodrigues.com/deep_LabelMe.tar.gz) and extract its contents into `DATA_PATH` as a directory named `LabelMe`.

In [None]:
data_set_name = "label-me"
label_me_path = f"{DATA_PATH}/LabelMe"
prepared_path = f"{label_me_path}/prepared"
data_dict = {}

# Read the samples (flattened to 8192 features each) and the true labels of every split.
for split in ("train", "test", "valid"):
    split_samples = np.load(f"{prepared_path}/data_{split}_vgg16.npy")
    data_dict[f"X-{split}"] = split_samples.astype(np.float32).reshape(-1, 8192)
    split_labels = np.load(f"{prepared_path}/labels_{split}.npy")
    data_dict[f"y-true-{split}"] = split_labels.astype(np.int64)

# The train split is stored without a suffix, together with the annotators' answers.
for unsuffixed_key in ("X", "y-true"):
    data_dict[unsuffixed_key] = data_dict.pop(f"{unsuffixed_key}-train")
data_dict["y"] = np.load(f"{prepared_path}/answers.npy").astype(np.int64)

# Describe each annotator by the flattened confusion matrix (normalized over all entries)
# between the true train labels and the annotator's answers.
n_annotators = data_dict["y"].shape[1]
n_classes = len(np.unique(data_dict["y-true"]))
class_labels = np.arange(n_classes)
data_dict["A"] = np.zeros((n_annotators, n_classes ** 2))
for annotator_idx, annotator_answers in enumerate(data_dict["y"].T):
    annotator_confusion = confusion_matrix(
        y_true=data_dict["y-true"], y_pred=annotator_answers, labels=class_labels, normalize="all"
    )
    data_dict["A"][annotator_idx] = annotator_confusion.ravel()

# Write every array to its own `.npy` file.
for array_name, array in data_dict.items():
    np.save(f"{DATA_PATH}/{data_set_name}-{array_name}.npy", array)

### Music Genre Classification Data Set
Download the archive [Music Genre Classification](http://fprodrigues.com/mturk-datasets.tar.gz) and extract its contents into `DATA_PATH` as a directory named `music_genre_classification`.

In [None]:
data_set_name = "music"
music_path = f"{DATA_PATH}/music_genre_classification"
data_dict = {}

# Load train and test data as well as the annotators' answers for the train samples.
train_df = pd.read_csv(f"{music_path}/music_genre_gold.csv", header=0)
train_ids = train_df["id"].values
test_df = pd.read_csv(f"{music_path}/music_genre_test.csv", header=0)
train_df_answers = pd.read_csv(f"{music_path}/music_genre_mturk.csv", header=0)

# Setup label encoder; answers that are not available are marked by `missing_label`.
# NOTE(review): the classes are passed explicitly, so the repeated `fit_transform` calls below
# presumably all use the same label mapping -- confirm against `ExtLabelEncoder`.
missing_label = "not-available"
le = ExtLabelEncoder(classes=train_df["class"].unique().astype(str), missing_label=missing_label)

# Separate validation data: 50 test samples are drawn without replacement.
val_indices = np.random.RandomState(RANDOM_STATE).choice(np.arange(len(test_df)), replace=False, size=50)
is_val = np.zeros(len(test_df), dtype=bool)
is_val[val_indices] = True

# Store train data. The first and last column are excluded from the features
# (presumably `id` and `class` -- TODO confirm against the CSV header).
data_dict["y-true"] = le.fit_transform(train_df["class"].values.astype(str)).astype(np.int64)
data_dict["X"] = train_df.values[:, 1:-1].astype(np.float32)

# Store validation data.
data_dict["y-true-valid"] = le.fit_transform(test_df["class"].values[is_val].astype(str)).astype(np.int64)
data_dict["X-valid"] = test_df.values[is_val][:, 1:-1].astype(np.float32)

# Store test data.
data_dict["y-true-test"] = le.fit_transform(test_df["class"].values[~is_val].astype(str)).astype(np.int64)
data_dict["X-test"] = test_df.values[~is_val][:, 1:-1].astype(np.float32)

# Store answers as a matrix of shape `(n_train_samples, n_annotators)`.
annotators = train_df_answers["annotator"].unique()
n_annotators = len(annotators)
# Fill an object array first: a fixed-width string array sized to the placeholder would silently
# truncate any class label longer than the placeholder.
answers = np.full((len(train_df), n_annotators), fill_value=missing_label, dtype=object)
answer_columns = zip(train_df_answers["id"], train_df_answers["annotator"], train_df_answers["class"])
for sample_id, annotator, answer in answer_columns:
    sample_idx = np.where(train_ids == sample_id)[0][0]
    annotator_idx = np.where(annotators == annotator)[0][0]
    answers[sample_idx, annotator_idx] = answer
data_dict["y"] = le.fit_transform(answers.astype(str)).astype(np.int64)

# Compute annotator features: each annotator is described by the flattened confusion matrix
# (normalized over all entries) between the true train labels and the annotator's answers.
n_classes = len(np.unique(data_dict["y-true"]))
data_dict["A"] = np.zeros((n_annotators, n_classes ** 2))
for a_idx in range(n_annotators):
    data_dict["A"][a_idx] = confusion_matrix(
        y_true=data_dict["y-true"], y_pred=data_dict["y"][:, a_idx], labels=np.arange(n_classes), normalize="all"
    ).ravel()

# Save created numpy arrays.
for key, item in data_dict.items():
    np.save(f"{DATA_PATH}/{data_set_name}-{key}.npy", item)