In [2]:
import numpy as np
from torchvision import datasets, transforms

In [None]:
def cifar_iid(dataset, num_users):
    """
    Split a dataset uniformly at random into equally sized, disjoint client shares.

    Only ``len(dataset)`` is used, so any sized container works.
    :param dataset: object supporting ``len()``
    :param num_users: number of clients to create
    :return: dict mapping client id (0..num_users-1) to a set of sample indices;
             the ``len(dataset) % num_users`` leftover samples stay unassigned
    """
    total = len(dataset)
    share = int(total / num_users)
    pool = list(range(total))
    assignment = {}
    for user in range(num_users):
        # draw this client's share without replacement from what is still free
        picked = set(np.random.choice(pool, share, replace=False))
        assignment[user] = picked
        # remove the drawn indices so the next client cannot receive them
        pool = list(set(pool) - picked)
    return assignment


def cifar_noniid(dataset_label, num_clients, num_classes, q):
    """
    Sample non-I.I.D. client data by chaining the group split and the client split
    :param dataset_label: per-sample class labels, forwarded to the helper functions
    :param num_clients: number of clients to create
    :param num_classes: number of classes (also the number of intermediate groups)
    :param q: forwarded to non_iid_distribution_group
    :return: dict of image index, as returned by non_iid_distribution_client
    """
    group_split = non_iid_distribution_group(dataset_label, num_clients, num_classes, q)
    client_split = non_iid_distribution_client(group_split, num_clients, num_classes)
    # print the per-client label fractions as a sanity check
    check_data_each_client(dataset_label, client_split, num_clients, num_classes)
    return client_split

def non_iid_distribution_group(dataset_label, num_clients, num_classes, q):
    """
    Split sample indices into ``num_classes`` label-skewed groups.

    For every class ``k`` a share ``int(n_k * q)`` of its samples goes to group ``k``
    and ``int(n_k * (1 - q) / (num_classes - 1))`` samples go to each other group.
    Samples left over by the integer truncation are then spread evenly, at random,
    over all groups; whatever still does not divide evenly stays unassigned.

    :param dataset_label: array-like of class labels, one per sample; labels are
                          compared against 0..num_classes-1
    :param num_clients: not used by this function; kept for signature compatibility
    :param num_classes: number of classes and of groups to build
    :param q: fraction of each class assigned to its own group
    :return: dict mapping group id to a set of sample indices
    """
    # Coerce once: with a plain list ``dataset_label == k`` is a single bool rather than
    # an element-wise mask, so no sample would be matched to any class.
    labels = np.asarray(dataset_label)
    dict_users = {group: set() for group in range(num_classes)}
    all_idxs = [i for i in range(len(labels))]
    for k in range(num_classes):
        idx_k = np.where(labels == k)[0]
        num_idx_k = len(idx_k)

        # dominant share of class k goes to its own group
        selected_q_data = set(np.random.choice(idx_k, int(num_idx_k*q), replace=False))
        dict_users[k] = dict_users[k] | selected_q_data
        idx_k = list(set(idx_k) - selected_q_data)
        all_idxs = list(set(all_idxs) - selected_q_data)
        for other_group in range(num_classes):
            if other_group == k:
                continue
            # the share size is computed here (not hoisted) so that num_classes == 1
            # never evaluates the division by (num_classes - 1)
            selected_not_q_data = set(np.random.choice(idx_k, int(num_idx_k*(1-q)/(num_classes-1)), replace=False))
            dict_users[other_group] = dict_users[other_group] | selected_not_q_data
            idx_k = list(set(idx_k) - selected_not_q_data)
            all_idxs = list(set(all_idxs) - selected_not_q_data)
    print(len(all_idxs),' samples are remained')
    print('random put those samples into groups')
    # distribute the truncation leftovers evenly over the groups
    num_rem_each_group = len(all_idxs) // num_classes
    for i in range(num_classes):
        selected_rem_data = set(np.random.choice(all_idxs, num_rem_each_group, replace=False))
        dict_users[i] = dict_users[i] | selected_rem_data
        all_idxs = list(set(all_idxs) - selected_rem_data)
    print(len(all_idxs),' samples are remained after relocating')
    return dict_users

def non_iid_distribution_client(group_proportion, num_clients, num_classes):
    """
    Split every group's samples into equally sized, disjoint client shares.

    Each of the ``num_classes`` groups is divided among
    ``num_clients // num_classes`` clients. The share size is derived from group 0
    and applied to every group. Client ids are consecutive: group ``i`` owns ids
    ``i * clients_per_group`` .. ``(i + 1) * clients_per_group - 1``.

    :param group_proportion: dict mapping group id (0..num_classes-1) to a collection
                             of sample indices
    :param num_clients: total number of clients
    :param num_classes: number of groups
    :return: dict mapping client id to a set of sample indices
    :raises ZeroDivisionError: if ``num_clients < num_classes`` (no client per group)
    :raises ValueError: from ``np.random.choice`` if a group holds fewer samples than
                        group 0 provides per client times the clients per group
    """
    num_each_group = num_clients // num_classes
    num_data_each_client = len(group_proportion[0]) // num_each_group
    dict_users = {}
    # Diagnostic only: which of the indices 0..N-1 were never handed to a client.
    unassigned = set(range(num_data_each_client * num_clients))
    for i in range(num_classes):
        group_data = list(group_proportion[i])
        for j in range(num_each_group):
            selected_data = set(np.random.choice(group_data, num_data_each_client, replace=False))
            # The offset must be the real group width; a fixed 10 leaves gaps in the ids
            # for narrower groups and overwrites clients for wider ones.
            dict_users[i * num_each_group + j] = selected_data
            group_data = list(set(group_data) - selected_data)
            unassigned -= selected_data
    print(len(unassigned),' samples are remained')
    return dict_users
def check_data_each_client(dataset_label, client_data_proportion, num_client, num_classes):
    """
    Print, for every client, the fraction of its samples that belongs to each class.

    :param dataset_label: array-like of class labels, indexed by sample index
    :param client_data_proportion: dict mapping client id to a collection of sample indices
    :param num_client: not used by this function; kept for signature compatibility
    :param num_classes: classes 0..num_classes-1 are reported
    :return: None (output goes to stdout)
    """
    # Coerce once so plain lists can be fancy-indexed and compared element-wise.
    labels = np.asarray(dataset_label)
    for client, sample_ids in client_data_proportion.items():
        sample_ids = list(sample_ids)
        # An empty selection is taken as an empty slice to keep the label dtype.
        client_data = labels[sample_ids] if sample_ids else labels[:0]
        total = len(client_data)
        print('client', client, 'distribution information:')
        for i in range(num_classes):
            # A client without samples reports 0.0 instead of dividing by zero.
            share = len(client_data[client_data == i]) / total if total else 0.0
            print('class ', i, ':', share)




In [27]:
from collections import defaultdict

In [37]:
# def mnist_noniid(dataset, num_users):
#     """
#     Sample non-I.I.D client data from MNIST dataset
#     :param dataset:
#     :param num_users:
#     :return:
#     """
#     num_shards, num_imgs = 200, 300
#     idx_shard = [i for i in range(num_shards)]
#     dict_users = {i: np.array([], dtype='int64') for i in range(num_users)}
#     idxs = np.arange(num_shards*num_imgs)
#     labels = dataset.train_labels.numpy()

#     # sort labels
#     idxs_labels = np.vstack((idxs, labels))
#     idxs_labels = idxs_labels[:,idxs_labels[1,:].argsort()]
#     idxs = idxs_labels[0,:]

#     # divide and assign
#     for i in range(num_users):
#         rand_set = set(np.random.choice(idx_shard, 2, replace=False))
#         idx_shard = list(set(idx_shard) - rand_set)
#         for rand in rand_set:
#             dict_users[i] = np.concatenate((dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
    # return dict_users
import numpy as np
from collections import defaultdict

def mnist_custom_non_iid(dataset, num_users):
    """
    Build a label-skewed (non-I.I.D.) partition dominated by class 0.

    The per-class quota is fixed once: 250 samples of class 0 and, for each of the
    classes 1-9, one random count between 20 and 25. Every user then draws that
    quota independently, so the same sample may appear for several users.
    :param dataset: object exposing ``targets`` (per-sample labels 0-9)
    :param num_users: number of users
    :return: defaultdict mapping user id to a set of image indices
    """
    # sample positions of every digit
    class_indices = {}
    for digit in range(10):
        class_indices[digit] = np.where(dataset.targets == digit)[0]

    # quota per digit: one dominant class, nine small random ones (20 to 25)
    quota = {0: 250}
    for digit in range(1, 10):
        quota[digit] = np.random.randint(20, 26)

    dict_users = defaultdict(set)
    for user in range(num_users):
        for digit, wanted in quota.items():
            # no replacement within a class, but nothing is removed between users
            picks = np.random.choice(class_indices[digit], size=wanted, replace=False)
            dict_users[user] = dict_users[user] | set(picks)

    return dict_users


In [1]:
def mnist_iid(dataset, num_users):
    """
    Randomly deal out equally sized, non-overlapping index sets to the users.

    :param dataset: anything with a length; only ``len(dataset)`` is consulted
    :param num_users: how many users receive a share
    :return: dict of image index sets keyed by user id; samples beyond
             ``num_users * int(len(dataset) / num_users)`` are not handed out
    """
    n_samples = len(dataset)
    per_user = int(n_samples / num_users)
    remaining = [sample for sample in range(n_samples)]
    partition = dict()
    for user_id in range(num_users):
        drawn = np.random.choice(remaining, per_user, replace=False)
        partition[user_id] = set(drawn)
        # shrink the candidate list by what this user just received
        remaining = list(set(remaining) - partition[user_id])
    return partition

In [4]:
# Load the MNIST training split from ../data/mnist/ (download=True is passed, so the
# files are presumably fetched when missing) and bind it to the global `dataset_train`
# that the partitioning cells read.
if __name__ == '__main__':
    dataset_train = datasets.MNIST('../data/mnist/', train=True, download=True,
                                   transform=transforms.Compose([
                                       # convert each image to a tensor, then normalise it
                                       transforms.ToTensor(),
                                       # NOTE(review): 0.1307 / 0.3081 look like the usual MNIST mean / std — confirm
                                       transforms.Normalize((0.1307,), (0.3081,))
                                   ]))


In [39]:
# Partition the training set across `num` clients with the class-0-dominated sampler.
# NOTE(review): `d` is reassigned further down by the `mnist_noniid` cell, so what the
# later tally loop inspects depends on which of the two cells ran last.
num = 100
d = mnist_custom_non_iid(dataset_train, num)

In [25]:
def mnist_noniid(dataset, num_users, num_shards=400, num_imgs=150, shards_per_user=4):
    """
    Sample non-I.I.D client data from MNIST dataset by dealing out label-sorted shards.

    The first ``num_shards * num_imgs`` samples are sorted by label and cut into
    ``num_shards`` contiguous shards of ``num_imgs`` samples; every user receives
    ``shards_per_user`` distinct shards chosen at random without replacement.

    :param dataset: object exposing per-sample labels as ``targets`` (preferred) or
                    ``train_labels``; a tensor with ``.numpy()`` or any array-like works
    :param num_users: number of users
    :param num_shards: number of shards to cut (default 400)
    :param num_imgs: samples per shard (default 150)
    :param shards_per_user: shards handed to every user (default 4)
    :return: dict mapping user id to a 1-D int64 array of sample indices
    :raises ValueError: if there are not enough shards for all users or not enough
                        labels to fill all shards
    """
    total = num_shards * num_imgs
    if num_users * shards_per_user > num_shards:
        # Previously this only surfaced part-way through the loop, from np.random.choice.
        raise ValueError(
            'need %d shards for %d users but only %d are available'
            % (num_users * shards_per_user, num_users, num_shards))

    # `train_labels` is the older attribute name; prefer `targets` when it exists.
    labels = getattr(dataset, 'targets', None)
    if labels is None:
        labels = dataset.train_labels
    labels = labels.numpy() if hasattr(labels, 'numpy') else np.asarray(labels)
    if len(labels) < total:
        raise ValueError(
            'dataset has %d labels but %d are required (num_shards * num_imgs)'
            % (len(labels), total))
    # Only the first `total` samples take part, so longer datasets no longer break vstack.
    labels = labels[:total]

    idx_shard = [i for i in range(num_shards)]
    dict_users = {i: np.array([], dtype='int64') for i in range(num_users)}
    idxs = np.arange(total)

    # sort labels
    idxs_labels = np.vstack((idxs, labels))
    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
    idxs = idxs_labels[0, :]

    # divide and assign
    for i in range(num_users):
        rand_set = set(np.random.choice(idx_shard, shards_per_user, replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        for rand in rand_set:
            dict_users[i] = np.concatenate((dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
    return dict_users

In [26]:
# Partition the training set across `num` clients with the shard-based sampler.
# NOTE(review): this rebinds `d`, replacing the result of the earlier
# `mnist_custom_non_iid` cell.
num = 100
d = mnist_noniid(dataset_train, num)

In [3]:
# Load a stored client partition; `.item()` unwraps the single Python object held by the
# loaded array (presumably a dict written with np.save — confirm).
# SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
# NOTE(review): this path is relative to ./data while the MNIST cells use ../data — confirm.
dict_users = np.load('./data/non_iid_cifar.npy', allow_pickle=True).item()

In [4]:
# Load the MNIST training split into the global `dataset_train`.
# NOTE(review): this repeats the earlier loading cell, minus its `__main__` guard.
dataset_train = datasets.MNIST('../data/mnist/', train=True, download=True,
                                transform=transforms.Compose([
                                    # convert each image to a tensor, then normalise it
                                    transforms.ToTensor(),
                                    # NOTE(review): 0.1307 / 0.3081 look like the usual MNIST mean / std — confirm
                                    transforms.Normalize((0.1307,), (0.3081,))
                                ]))

In [27]:
# Count, for each of the 100 clients in `d`, how many of its samples carry each label
# 0-9 and print one tally line per client.
for i in range(100):
    client = [0] * 10  # one counter per label
    # Walk the client's indices directly: rebuilding list(d[i]) for every sample,
    # as before, made this loop quadratic in the client's sample count.
    for idx in d[i]:
        label = dataset_train[idx][1]  # the second item of a dataset entry is used as the label
        client[label] += 1
    print("client: ", i , client)

client:  0 [0, 300, 0, 0, 0, 150, 0, 150, 0, 0]
client:  1 [150, 0, 0, 0, 150, 0, 150, 150, 0, 0]
client:  2 [150, 0, 0, 0, 0, 150, 150, 150, 0, 0]
client:  3 [0, 0, 0, 150, 0, 150, 150, 0, 150, 0]
client:  4 [0, 0, 150, 0, 0, 150, 0, 0, 150, 150]
client:  5 [0, 0, 150, 0, 300, 0, 0, 150, 0, 0]
client:  6 [150, 0, 0, 0, 0, 150, 0, 150, 150, 0]
client:  7 [0, 0, 0, 150, 300, 0, 0, 0, 0, 150]
client:  8 [0, 300, 0, 150, 0, 0, 0, 0, 0, 150]
client:  9 [0, 150, 0, 150, 0, 0, 0, 150, 0, 150]
client:  10 [0, 0, 150, 150, 0, 0, 0, 150, 0, 150]
client:  11 [150, 150, 150, 0, 0, 0, 150, 0, 0, 0]
client:  12 [0, 150, 0, 0, 0, 0, 0, 300, 150, 0]
client:  13 [0, 0, 300, 0, 150, 0, 0, 150, 0, 0]
client:  14 [150, 150, 150, 0, 150, 0, 0, 0, 0, 0]
client:  15 [150, 0, 0, 150, 0, 0, 150, 0, 150, 0]
client:  16 [0, 150, 150, 0, 0, 0, 0, 0, 0, 300]
client:  17 [0, 150, 150, 0, 0, 0, 150, 0, 150, 0]
client:  18 [300, 0, 0, 300, 0, 0, 0, 0, 0, 0]
client:  19 [0, 0, 0, 0, 150, 0, 150, 0, 0, 300]
client:  2