In [1]:
import numpy as np
import pandas as pd
import os
import random
import math
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("dataset/train.csv")

In [97]:
class DistributedDataSet:
    def __init__(self, data, seed, batch_size, clients_num):
        self.data = data
        self.seed = seed
        self.batch_size = batch_size
        self.clients_num = clients_num

        self.data_size = None
        self.n_classes = None
        self.client_datasets = {}

    def data_split(self, test_size, y_name):
        train, test = train_test_split(self.data, random_state=self.seed, test_size=test_size, stratify=self.data[y_name])
        train.index = np.arange(len(train))
        test.index = np.arange(len(test))
        return train, test

    def split_and_shuffle_labels(self, y_data, seed, amount, n_classes):
        y_data=pd.DataFrame(y_data,columns=["labels"])
        y_data["i"]=np.arange(len(y_data))
        label_dict = dict()
        for i in range(n_classes):
            var_name="label" + str(i)
            label_info=y_data[y_data["labels"]==i]
            np.random.seed(seed)
            label_info=np.random.permutation(label_info)
            label_info=label_info[0:amount]
            label_info=pd.DataFrame(label_info, columns=["labels","i"])
            label_dict.update({var_name: label_info })
        return label_dict

    def get_iid_subsamples_indices(self, label_dict, number_of_samples, batch_size, n_classes):
        sample_dict= dict()
        batch_size = int(batch_size/n_classes)
        for i in range(number_of_samples):
            sample_name="sample"+str(i)
            dumb=pd.DataFrame()
            for j in range(n_classes):
                label_name=str("label")+str(j)
                a=label_dict[label_name][i*batch_size:(i+1)*batch_size]
                dumb=pd.concat([dumb,a], axis=0)
            dumb.reset_index(drop=True, inplace=True)    
            sample_dict.update({sample_name: dumb}) 
        return sample_dict

    def create_iid_subsamples(self, sample_dict, data):
        data_li = []

        for i in range(len(sample_dict)):  ### len(sample_dict)= number of samples
            sample_name="sample"+str(i)
            indices=np.sort(np.array(sample_dict[sample_name]["i"]))
            df = data.loc[indices]
            data_li.append(df)
    
        return data_li

    def get_distributed_dataset(self, test_size, y_name):
        self.data.rename(columns={y_name: "y"}, inplace=True)
        y_name = "y"
        train, test = self.data_split(test_size, y_name)
        min_class_num = min(train[y_name].value_counts().values)
        self.n_classes = self.data[y_name].nunique()
        self.data_size = len(self.data)
        label_dict = self.split_and_shuffle_labels(train[y_name].values, self.seed, min_class_num, self.n_classes) 

        clients_labels_dict = {}
        for labels, values in label_dict.items():
            a = np.array_split(values, self.clients_num)
            client_count = 0
            for i in a:
                try:
                    clients_labels_dict[client_count][labels] = i
                except:
                    clients_labels_dict[client_count] = {}
                    clients_labels_dict[client_count][labels] = i
                client_count+=1


        for cl, cl_label_dict in clients_labels_dict.items():
            subsample_indices = self.get_iid_subsamples_indices(cl_label_dict, int((self.n_classes*min_class_num)/(self.clients_num*self.batch_size)), self.batch_size, self.n_classes)
            # print(self.data_size,1-test_size,self.clients_num,self.batch_size)
            iid_samples = self.create_iid_subsamples(subsample_indices, train)
            self.client_datasets[cl] = iid_samples

        return self.client_datasets, test

In [98]:
dd = DistributedDataSet(df, 0, 32, 4)

In [99]:
df_di, test = dd.get_distributed_dataset(0.1, 'label')

In [101]:
len(df_di[0][0])

30

In [3]:
data = df
seed = 0
batch_size = 32
clients_num = 4

In [4]:
data.rename(columns={'label': "y"}, inplace=True)
y_name = "y"
train, test = train_test_split(data, random_state=seed, test_size=0.1, stratify=data[y_name])
train.index = np.arange(len(train))
test.index = np.arange(len(test))

In [5]:
min_class_num = min(train[y_name].value_counts().values)
n_classes = data[y_name].nunique()
data_size = len(data)

In [6]:
y_data=pd.DataFrame(train['y'].values,columns=["labels"])
y_data["i"]=np.arange(len(y_data))
label_dict = dict()
for i in range(n_classes):
    var_name="label" + str(i)
    label_info=y_data[y_data["labels"]==i]
    np.random.seed(seed)
    label_info=np.random.permutation(label_info)
    label_info=label_info[0:min_class_num]
    label_info=pd.DataFrame(label_info, columns=["labels","i"])
    label_dict.update({var_name: label_info })

In [9]:
clients_labels_dict = {}
for labels, values in label_dict.items():
    a = np.array_split(values, clients_num)
    client_count = 0
    for i in a:
        try:
            clients_labels_dict[client_count][labels] = i
        except:
            clients_labels_dict[client_count] = {}
            clients_labels_dict[client_count][labels] = i
        client_count+=1

In [45]:
clients_labels_dict = {}
for labels, values in label_dict.items():
    a = np.array_split(values, clients_num)
    client_count = 0
    for i in a:
        try:
            clients_labels_dict[client_count] = pd.concat([clients_labels_dict[client_count], i])
        except:
            clients_labels_dict[client_count] = i
        client_count+=1

#     clients_labels_dict[client_count - 1] = clients_labels_dict[client_count - 1].sample(frac=1).reset_index(drop=True)

for client in clients_labels_dict.keys():
    clients_labels_dict[client] = train.iloc[clients_labels_dict[client].i].sample(frac=1).reset_index(drop=True)
    


In [52]:
len(clients_labels_dict[2])

8540

In [44]:
train.iloc[clients_labels_dict[0].i]

Unnamed: 0,y,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
20506,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23435,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23906,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28215,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14655,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30546,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28024,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20025,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27014,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
