## Dataset Details

5 archetype users:
* 1: Labels consistent with image, i.e., (0, 1, 2, 3, 4) -> (0, 1, 2, 3, 4)
* 2: (0, 1, 2, 3, 4) -> (1, 2, 3, 4, 0)
* 3: (0, 1, 2, 3, 4) -> (2, 3, 4, 0, 1)
* 4: (0, 1, 2, 3, 4) -> (3, 4, 0, 1, 2)
* 5: (0, 1, 2, 3, 4) -> (4, 0, 1, 2, 3)

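100 users, 20 of each type, assigned in consecutive blocks of 20 user ids (the generation code below rolls the label table right by one per block, so the blocks appear in archetype order 1, 5, 4, 3, 2). For each user: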
* inputs are randomly sampled MNIST digits (0-4)
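* outputs are the labels of the associated archetype with probability .9, random label with probability .1 (implemented by overwriting exactly 10% of each user's train and validation labels, i.e. 2 of 20 per split, with labels drawn uniformly from the 5 options; a drawn label may coincide with the archetype label, and test labels are left noise-free)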
* 20 train examples, 20 validation, 50 test

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import numpy as np
import torch
import torchvision.datasets as datasets

In [3]:
# Root directory for the MNIST files (passed as `root=` to the dataset loaders) and
# parent directory of the generated JSON dataset.
data_path = "/data/ddmg/redditlanguagemodeling/data/MNIST"

In [16]:
# MNIST training split, stored under data_path (download=True fetches it if it is missing).
mnist_train = datasets.MNIST(root=data_path, train=True, download=True)

In [20]:
# NOTE: this cell was executed as In [20], i.e. after the digit filter cell (In [18]),
# so the count shown is the number of training images with labels 0-4, not the full split.
len(mnist_train)

30596

In [17]:
# MNIST test split, stored under data_path (download=True fetches it if it is missing).
mnist_test = datasets.MNIST(root=data_path, train=False, download=True)

In [21]:
# NOTE: this cell was executed as In [21], i.e. after the digit filter cell (In [19]),
# so the count shown is the number of test images with labels 0-4, not the full split.
len(mnist_test)

5139

In [18]:
# Restrict the training split to digits 0-4. Despite its name, keep_indices is a boolean
# mask over the labels; it is applied to the images first and then to the labels.
keep_indices = mnist_train.targets <= 4
mnist_train.data = mnist_train.data[keep_indices]
mnist_train.targets = mnist_train.targets[keep_indices]

In [19]:
# Restrict the test split to digits 0-4. Despite its name, keep_indices is a boolean
# mask over the labels; it is applied to the images first and then to the labels.
keep_indices = mnist_test.targets <= 4
mnist_test.data = mnist_test.data[keep_indices]
mnist_test.targets = mnist_test.targets[keep_indices]

In [25]:
# Relabelling table of archetype 2: digit d is mapped to labels[d].
labels = torch.tensor([1, 2, 3, 4, 0])
# Clone the slice: a basic slice of a tensor is a view that shares storage with
# mnist_test.targets, so any in-place edit of this scratch tensor (such as a label-noise
# experiment) would silently corrupt the real test labels used to build the dataset.
test_targets = mnist_test.targets[:10].clone()

In [26]:
test_targets

tensor([2, 1, 0, 4, 1, 4, 0, 0, 1, 3])

In [27]:
# Relabel by tensor indexing: every target t is replaced by labels[t].
new_targets = labels[test_targets]
new_targets

tensor([3, 2, 1, 0, 2, 0, 1, 1, 2, 4])

In [34]:
def add_label_noise(labels, label_options, noise_perc):
    """Return a copy of ``labels`` with a fixed fraction overwritten by random labels.

    Exactly ``int(len(labels) * noise_perc)`` distinct positions are picked
    uniformly at random and overwritten with values drawn (with replacement)
    from ``label_options``. A drawn value may equal the label it replaces, so
    the number of labels that actually change can be smaller than that count.

    The input is never modified. This matters because callers may pass a view
    of a dataset's label tensor, and an in-place write would corrupt it.

    Args:
        labels: 1-D ``torch.Tensor`` or array-like of labels.
        label_options: Candidate values for the noisy labels (tensor or
            array-like).
        noise_perc: Fraction of entries to overwrite.

    Returns:
        A new ``torch.Tensor`` when ``labels`` is a tensor, otherwise a new
        ``numpy.ndarray``, with the same length as ``labels``.
    """
    is_tensor = isinstance(labels, torch.Tensor)
    # Work on a private copy so the caller's data is left untouched.
    noisy = labels.clone() if is_tensor else np.array(labels)

    num_labels = len(noisy)
    corrupt_count = int(num_labels * noise_perc)
    if corrupt_count == 0:
        # Nothing to corrupt (empty input or a fraction that rounds down to zero).
        return noisy

    # Positions are sampled without replacement, replacement values with replacement.
    noise_idxs = np.random.choice(num_labels, size=corrupt_count, replace=False)
    noise_labels = np.random.choice(np.asarray(label_options), size=corrupt_count)

    if is_tensor:
        # Convert explicitly so index and value types match the destination tensor.
        positions = torch.as_tensor(noise_idxs, dtype=torch.long, device=noisy.device)
        noisy[positions] = torch.as_tensor(
            noise_labels, dtype=noisy.dtype, device=noisy.device
        )
    else:
        noisy[noise_idxs] = noise_labels
    return noisy

In [33]:
# NOTE(review): this cell ran as In [33], before the In [34] definition of add_label_noise
# shown above, so the "noise_idx" / "noise_labels" lines in its captured output presumably
# come from an earlier version of the function that printed them.
add_label_noise(test_targets, torch.tensor([0, 1, 2, 3, 4]), 0.10)

noise_idx [1]
noise_labels [4]


tensor([2, 4, 0, 4, 1, 4, 0, 0, 1, 3])

In [41]:
# Build the per-user dataset as a flat list of {"x", "y", "split", "user"} records.
# Users come in consecutive blocks that share one relabelling table; every user gets
# disjoint train/val/test samples (indices are never reused across users).
NUM_USERS = 100
USERS_PER_ARCHETYPE = 20
NUM_TRAIN, NUM_VAL, NUM_TEST = 20, 20, 50
NOISE_FRACTION = .10

data_list = []
labels = torch.tensor([0, 1, 2, 3, 4])
# Index pools plus boolean masks marking which examples are still unused.
train_idxs = np.arange(len(mnist_train.data))
train_mask = np.ones(len(train_idxs), dtype=bool)
test_idxs = np.arange(len(mnist_test.data))
test_mask = np.ones(len(test_idxs), dtype=bool)

for user in range(NUM_USERS):
    if user % USERS_PER_ARCHETYPE == 0 and user != 0:
        # Move to the next relabelling table. torch.roll (rather than np.roll) keeps
        # `labels` a tensor for every user instead of turning it into an ndarray after
        # the first block. Rolling right by one gives [4, 0, 1, 2, 3] for the second
        # block, then [3, 4, 0, 1, 2], [2, 3, 4, 0, 1] and [1, 2, 3, 4, 0].
        labels = torch.roll(labels, 1)

    # Draw train and validation examples together from the unused training pool.
    train_sample_idx = np.random.choice(
        train_idxs[train_mask], size=NUM_TRAIN + NUM_VAL, replace=False
    )
    train_part = train_sample_idx[:NUM_TRAIN]
    val_part = train_sample_idx[NUM_TRAIN:]
    # Images are flattened from (N, H, W) to (N, H*W); digits are remapped through `labels`.
    train_x = mnist_train.data[train_part].flatten(1, 2)
    train_y = labels[mnist_train.targets[train_part]]
    val_x = mnist_train.data[val_part].flatten(1, 2)
    val_y = labels[mnist_train.targets[val_part]]
    train_mask[train_sample_idx] = False  # mark as used

    # Draw test examples from the unused test pool.
    test_sample_idx = np.random.choice(test_idxs[test_mask], size=NUM_TEST, replace=False)
    test_x = mnist_test.data[test_sample_idx].flatten(1, 2)
    test_y = labels[mnist_test.targets[test_sample_idx]]
    test_mask[test_sample_idx] = False  # mark as used

    # Label noise is applied to train and val only; test labels stay clean.
    train_y = add_label_noise(train_y, labels, NOISE_FRACTION)
    val_y = add_label_noise(val_y, labels, NOISE_FRACTION)

    # Append one record per example, in train / val / test order.
    for split, xs, ys in (
        ("train", train_x, train_y),
        ("val", val_x, val_y),
        ("test", test_x, test_y),
    ):
        data_list.extend(
            {"x": x.tolist(), "y": y.item(), "split": split, "user": user}
            for x, y in zip(xs, ys)
        )

In [43]:
# Wrap the records with a version tag ("21.12.17" looks like a yy.mm.dd date stamp; TODO confirm).
data_dict = {"version": "21.12.17", "data": data_list}

In [None]:
# Serialise the dataset to <data_path>/5_archetype_test/full_data.json.
output_dir = os.path.join(data_path, "5_archetype_test")
# open(..., "w") does not create missing parent directories, so create it up front.
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "full_data.json"), "w") as f:
    json.dump(data_dict, f)