In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [3]:
# Fetch the Iris feature matrix together with its class labels
X, y = load_iris(return_X_y=True)

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1403,
)

# Display the shape of every split as a sanity check
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

In [4]:
# Persist the held-out split for the server.
# `ones` is a 3x3 identity matrix: indexing it with class ids yields one-hot label rows.
ones = np.eye(N=3, dtype=int)

# Each row holds the feature columns followed by the one-hot label columns
test_pd = pd.DataFrame(
    np.hstack((X_test, ones[y_test]))
)
test_pd.to_csv(path_or_buf='./evaluation_set.csv', header=None)

In [5]:
# functions for splitting iid & non-iid

def iid_sampling(X, y, num_client):
    """Deal sample indices out to clients uniformly at random (IID split).

    Every client receives ``len(X) // num_client`` distinct indices and no
    index is handed to more than one client. When the division is not exact,
    the leftover indices stay unassigned.

    The draw uses a private ``RandomState`` seeded with 1403, so repeated
    calls return the same partition while the caller's global ``np.random``
    state is left untouched.

    Parameters
    ----------
    X : sized collection
        Samples to partition; only ``len(X)`` is used.
    y : any
        Unused here; kept so the signature mirrors ``non_iid_sampling``.
    num_client : int
        Number of clients; must be positive.

    Returns
    -------
    dict
        Maps each client id ``0 .. num_client - 1`` to a 1-D array of indices
        into ``X``.

    Raises
    ------
    ValueError
        If ``num_client`` is not positive.
    """
    if num_client <= 0:
        raise ValueError(f"num_client must be a positive integer, got {num_client!r}")

    # Private generator: same legacy seeding as np.random.seed(1403), but it
    # does not reseed the process-wide generator as a side effect.
    rng = np.random.RandomState(1403)

    num_items = len(X) // num_client
    dict_users = {}
    all_idxs = list(range(len(X)))
    for i in range(num_client):
        dict_users[i] = rng.choice(all_idxs, num_items, replace=False)
        # Remove what was just assigned so later clients cannot draw it again.
        all_idxs = list(set(all_idxs) - set(dict_users[i]))
    return dict_users


def non_iid_sampling(X, y, num_client, num_shards, num_imgs, shards_per_client):
    """Deal label-sorted shards of sample indices out to clients (non-IID split).

    The sample indices are ordered by label and cut into ``num_shards``
    consecutive shards of ``num_imgs`` indices each. Every client then
    receives ``shards_per_client`` randomly chosen shards, and no shard is
    given to more than one client.

    The draw uses a private ``RandomState`` seeded with 2024, so repeated
    calls return the same partition while the caller's global ``np.random``
    state is left untouched.

    Parameters
    ----------
    X : sized collection
        Samples to partition; only ``len(X)`` is used.
    y : array-like
        One label per sample, used to sort the indices.
    num_client : int
        Number of clients.
    num_shards : int
        Total number of shards.
    num_imgs : int
        Number of samples per shard.
    shards_per_client : int
        Number of shards assigned to each client.

    Returns
    -------
    dict
        Maps each client id ``0 .. num_client - 1`` to a 1-D integer array of
        ``shards_per_client * num_imgs`` indices into ``X``.

    Raises
    ------
    AssertionError
        If ``num_shards * num_imgs != len(X)`` or
        ``shards_per_client * num_client != num_shards``.
    """
    assert (num_shards * num_imgs) == len(X), \
        "num_shards * num_imgs must equal the number of samples"
    assert shards_per_client * num_client == num_shards, \
        "shards_per_client * num_client must equal num_shards"

    # Private generator: same legacy seeding as np.random.seed(2024), but it
    # does not reseed the process-wide generator as a side effect.
    rng = np.random.RandomState(2024)

    idx_shard = list(range(num_shards))
    dict_users = {i: np.array([], dtype=int) for i in range(num_client)}
    idxs = np.arange(num_shards * num_imgs)

    # sort sample indices by label
    idxs_labels = np.vstack((idxs, y))
    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
    # vstack promotes the index row to y's dtype (e.g. float labels); cast it
    # back so the returned arrays are always usable as indices.
    idxs = idxs_labels[0, :].astype(idxs.dtype)

    # draw `shards_per_client` unused shards for each client
    for i in range(num_client):
        rand_set = set(rng.choice(idx_shard, shards_per_client, replace=False))
        idx_shard = list(set(idx_shard) - rand_set)
        for rand in rand_set:
            dict_users[i] = np.concatenate(
                (dict_users[i], idxs[rand * num_imgs:(rand + 1) * num_imgs]), axis=0)
    return dict_users


In [6]:
def save_clients_data(X, y, user_idxs, name, base_path=".", one_hot=None):
    """Write one CSV file per client with its features and one-hot labels.

    For every client in ``user_idxs`` the selected rows of ``X`` are placed
    next to the one-hot encoding of the matching labels and saved to
    ``{base_path}/client_{name}_{user}.csv`` without a header row (pandas
    writes the row index as the first column by default).

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by the per-client index arrays.
    y : numpy.ndarray
        Integer class labels aligned with ``X``.
    user_idxs : dict
        Maps a client id to an array of row indices into ``X`` / ``y``.
    name : str
        Tag inserted into every output file name.
    base_path : str, optional
        Directory that receives the files; defaults to the current directory.
    one_hot : numpy.ndarray, optional
        Lookup matrix whose row ``k`` is the encoding of class ``k``. When
        omitted, the notebook-level ``ones`` matrix is used.
    """
    if one_hot is None:
        # Backward-compatible default: the identity matrix defined at
        # notebook level.
        one_hot = ones

    for user, idxs in user_idxs.items():
        X_user = X[idxs]
        y_user = one_hot[y[idxs]]
        pd_user = pd.DataFrame(np.hstack((X_user, y_user)))
        pd_user.to_csv(f'{base_path}/client_{name}_{user}.csv', header=None)

In [6]:
# IID split: with a single client, that client receives every training index
num_clients = 1

iid_user_idxs = iid_sampling(X_train, y_train, num_client=num_clients)
save_clients_data(
    X_train,
    y_train,
    user_idxs=iid_user_idxs,
    name='iid-single',
)

In [7]:
# Non-IID split: label-sorted training indices are cut into shards and dealt out to clients
num_clients = 3
shards_per_client = 4
num_shards = 12     # = num_clients * shards_per_client
num_imgs = 10       # samples per shard; num_shards * num_imgs = 120 training rows

non_iid_user_idxs = non_iid_sampling(
    X_train,
    y_train,
    num_client=num_clients,
    num_shards=num_shards,
    num_imgs=num_imgs,
    shards_per_client=shards_per_client,
)
save_clients_data(
    X_train,
    y_train,
    user_idxs=non_iid_user_idxs,
    name='non-iid',
)