In [27]:
import gzip
import random
import pickle
import numpy as np
import pandas as pd

from pathlib import Path
from matplotlib import pyplot

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [2]:
def _split_and_shuffle_labels(y_data, seed):
    num_of_class = len(set(y_data.tolist()))
    y_data=pd.DataFrame(y_data, columns=['label'])
    y_data['index'] = np.arange(len(y_data))
    label_dict = dict()
    cur_idx = list()

    for i in range(num_of_class):
        var_name = 'label' + str(i)
        label_info = y_data[y_data['label'] == i]
        np.random.seed(seed)
        label_info = np.random.permutation(label_info)
        label_info = pd.DataFrame(label_info, columns=['label', 'index'])
        label_dict.update({var_name: label_info })
        cur_idx.append(0)

    return label_dict, cur_idx

In [3]:
def _get_iid_subsamples_indices(y_data, number_of_samples, seed):
    """Assign every row of ``y_data`` to one of ``number_of_samples`` IID subsets.

    Each subset receives an equal share (``len(class) // number_of_samples``)
    of every class; the last subset additionally takes whatever rows are left
    over after the integer division.

    Parameters
    ----------
    y_data : object exposing ``tolist()``
        Class labels, forwarded to ``_split_and_shuffle_labels``.
    number_of_samples : int
        Number of subsets to build.
    seed : int
        Shuffle seed, forwarded to ``_split_and_shuffle_labels``.

    Returns
    -------
    dict
        Maps ``'sample<i>'`` to a DataFrame with columns ``['label', 'index']``.
    """
    sample_dict = dict()
    if number_of_samples <= 0:
        return sample_dict

    num_of_class = len(set(y_data.tolist()))
    label_dict, cur_idx = _split_and_shuffle_labels(y_data, seed)
    last_sample = number_of_samples - 1

    for i in range(number_of_samples):
        parts = []
        for j in range(num_of_class):
            rows = label_dict['label' + str(j)]
            if i == last_sample:
                # The final subset absorbs the remainder of the division.
                next_idx = len(rows)
            else:
                # The share depends on the number of subsets, not on the number
                # of classes; the two only coincide for the 10/10 default.
                next_idx = cur_idx[j] + len(rows) // number_of_samples
            parts.append(rows.iloc[cur_idx[j]:next_idx])
            cur_idx[j] = next_idx
        # Concatenate once per subset instead of growing a frame in the loop.
        if parts:
            merged = pd.concat(parts, axis=0).reset_index(drop=True)
        else:
            merged = pd.DataFrame()
        sample_dict['sample' + str(i)] = merged
    return sample_dict

In [4]:
def _get_non_iid_subsamples_indices(y_data, number_of_samples, pdist, seed):
    """Assign every row of ``y_data`` to one of ``number_of_samples`` skewed subsets.

    For subset ``i`` and class ``j`` the fraction of the class taken is

    * ``pdist * 2/3`` when ``j == i``,
    * ``pdist / 3`` when ``j % 5 == i % 5`` (and ``j != i``),
    * ``(1 - pdist) / (num_of_class - 2)`` otherwise.

    The last subset ignores the fractions and takes all rows still unassigned.
    NOTE(review): the modulus 5 and the ``num_of_class - 2`` divisor presumably
    target 10 classes and 10 subsets -- confirm before using other sizes.

    Returns a dict mapping ``'sample<i>'`` to a DataFrame with columns
    ``['label', 'index']``.
    """
    class_count = len(set(y_data.tolist()))
    per_label, cursors = _split_and_shuffle_labels(y_data, seed)
    final_sample = number_of_samples - 1
    result = {}

    for sample_no in range(number_of_samples):
        major_share = pdist * (2 / 3)
        paired_share = pdist - major_share
        minor_share = (1.0 - pdist) / (class_count - 2)

        merged = pd.DataFrame()
        for label_no in range(class_count):
            rows = per_label['label' + str(label_no)]

            if label_no == sample_no:
                share = major_share
            elif (label_no % 5) == (sample_no % 5):
                share = paired_share
            else:
                share = minor_share

            start = cursors[label_no]
            if sample_no == final_sample:
                stop = len(rows)
            else:
                stop = int(len(rows) * share) + start

            merged = pd.concat([merged, rows[start:stop]], axis=0)
            cursors[label_no] = stop

        merged.reset_index(drop=True, inplace=True)
        result['sample' + str(sample_no)] = merged
    return result

In [5]:
def _create_subsamples(sample_dict, x_data, y_data, x_name, y_name):
    x_data_dict= dict()
    y_data_dict= dict()
    
    for i in range(len(sample_dict)):  ### len(sample_dict)= number of samples
        xname= x_name+str(i)
        yname= y_name+str(i)
        sample_name="sample"+str(i)
        
        indices=np.sort(np.array(sample_dict[sample_name]['index']))
        
        x_info= x_data[indices,:]
        x_data_dict.update({xname : x_info})
        
        y_info= y_data[indices]
        y_data_dict.update({yname : y_info})
        
    return x_data_dict, y_data_dict

In [293]:
def _create_corrupted_subsamples(sample_dict, x_data, y_data, x_name, y_name,
                                 cor_local_ratio=1.0, cor_label_ratio=0.2, cor_data_ratio=0.5, mode=1):
    x_data_dict= dict()
    y_data_dict= dict()
    
    # make corrupted info
    num_of_local = len(sample_dict)
    num_of_label = len(set(y_data.tolist()))
    cor_local_idx = random.sample(range(0, num_of_local), int(num_of_local * cor_local_ratio))
    cor_label_idx = random.sample(range(0, num_of_label), int(num_of_label * cor_label_ratio))
    temp = set(y_data.tolist())
    temp.difference_update(cor_label_idx)
    if mode == 1:
        temp = list(temp)
        cor_vals = random.sample(temp, int(num_of_label * cor_label_ratio))
        print(cor_label_idx, '->', cor_vals)
    else:
        print(cor_label_idx, '-> random value')
    
    for i in range(len(sample_dict)):  ### len(sample_dict)= number of samples
        xname= x_name+str(i)
        yname= y_name+str(i)
        sample_name="sample"+str(i)
        
        indices=np.sort(np.array(sample_dict[sample_name]['index']))
        
        x_info= x_data[indices,:]
        x_data_dict.update({xname : x_info})
        
        y_info= y_data[indices]
        
        if i in cor_local_idx:
            val_cnt = 0
            for j in cor_label_idx:
                temp_dices = np.where(y_info == j)[0]
                cor_data_len = int(len(temp_dices) * cor_data_ratio)
                corrupted_idx = random.sample(list(temp_dices), cor_data_len)
                
                if mode == 1:
                    y_info[corrupted_idx] = cor_vals[val_cnt]
                    val_cnt = val_cnt + 1
                else:
                    for i in corrupted_idx:
                        temp_x = temp
                        ori_val = y_info[i].item()
                        temp_x.difference_update([ori_val])
                        y_info[i] = random.sample(temp_x, 1)[0]
    
        
        y_data_dict.update({yname : y_info})
        
    return x_data_dict, y_data_dict

In [295]:
def _create_corrupted_subsamples2(sample_dict, x_data, y_data, x_name, y_name,
                                  cor_local_ratio=1.0, cor_minor_label_cnt=4,
                                  cor_major_data_ratio=0.2,
                                  cor_minor_data_ratio=0.5,
                                  mode=1):
    x_data_dict= dict()
    y_data_dict= dict()

    # make corrupted info
    num_of_local = len(sample_dict)
    num_of_label = len(set(y_data.tolist()))
    cor_local_idx = random.sample(range(0, num_of_local), int(num_of_local * cor_local_ratio))
    
    for i in range(len(sample_dict)):  ### len(sample_dict)= number of samples
        xname= x_name+str(i)
        yname= y_name+str(i)
        sample_name="sample"+str(i)
        
        indices=np.sort(np.array(sample_dict[sample_name]['index']))
        
        x_info= x_data[indices,:]
        x_data_dict.update({xname : x_info})
        
        y_info= y_data[indices]
        
        if i in cor_local_idx:
            cor_major_label_idx = list()
            cor_major_label_idx.append(i)
            cor_major_label_idx.append((i+5)%num_of_label)
            
            for j in cor_major_label_idx:
                temp_dices = np.where(y_info == j)[0]
                cor_data_len = int(len(temp_dices) * cor_major_data_ratio)
                corrupted_idx = random.sample(list(temp_dices), cor_data_len)

                ori_val = y_info[corrupted_idx][0]
                y_info[corrupted_idx] = (ori_val + 5) % num_of_label
        
            temp = set(tr_y.tolist())
            temp.difference_update(cor_major_label_idx)
            cor_minor_label_idx = random.sample(temp, cor_minor_label_cnt)
            temp.difference_update(cor_minor_label_idx)
            cor_minor_vals = random.sample(temp, cor_minor_label_cnt)
            print(cor_major_label_idx, '|', cor_minor_label_idx, '->', cor_minor_vals)
        
            val_cnt = 0
            for j in cor_minor_label_idx:
                temp_dices = np.where(y_info == j)[0]
                cor_data_len = int(len(temp_dices) * cor_minor_data_ratio)
                corrupted_idx = random.sample(list(temp_dices), cor_data_len)
                
                if mode == 1:
                    y_info[corrupted_idx] = cor_minor_vals[val_cnt]
                    val_cnt = val_cnt + 1
                else:
                    cor_minor_vals = list()
                    for i in corrupted_idx:
                        temp_x = temp
                        ori_val = y_info[i].item()
                        temp_x.difference_update([ori_val])
                        y_info[i] = random.sample(temp_x, 1)[0]
        
        y_data_dict.update({yname : y_info})
        
    return x_data_dict, y_data_dict

In [6]:
def _print_dict(x_train_dict, y_train_dict, x_test_dict, y_test_dict):
    sum = 0
    print('[*] Train Dataset (x, y)')
    for idx, (x_key, y_key) in enumerate(zip(x_train_dict, y_train_dict)):
        sum += len(x_train_dict[x_key])
        print('- sample{}: {}, {}'.format(idx, len(x_train_dict[x_key]), len(y_train_dict[y_key])))
        print(': ', end='')
        for i in range(10):
            print(y_train_dict[y_key].tolist().count(i), end=' ')
        print('')
    print('# total:', sum, end='\n\n')

    sum = 0
    print('[*] Test Dataset (x, y)')
    for idx, (x_key, y_key) in enumerate(zip(x_test_dict, y_test_dict)):
        sum += len(x_test_dict[x_key])
        print('- sample{}: {}, {}'.format(idx, len(x_test_dict[x_key]), len(y_test_dict[y_key])))
        print(': ', end='')
        for i in range(10):
            print(y_test_dict[y_key].tolist().count(i), end=' ')
        print('')
    print('# total:', sum)

In [7]:
def load_mnist_data(path='./data/mnist.pkl.gz', torch_tensor=True):
    """Read a gzip-compressed pickle holding ``((x_train, y_train), (x_test, y_test))``.

    When ``torch_tensor`` is true each of the four arrays is converted with
    ``torch.tensor``. The four shapes are printed, and the arrays are returned
    as ``(x_train, y_train, x_test, y_test)``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with gzip.open(Path(path), "rb") as handle:
        train_pair, test_pair = pickle.load(handle)
    x_train, y_train = train_pair
    x_test, y_test = test_pair

    arrays = [x_train, y_train, x_test, y_test]
    if torch_tensor:
        arrays = [torch.tensor(a) for a in arrays]

    print(*(a.shape for a in arrays))
    return tuple(arrays)

In [8]:
def create_non_iid_samples(x_train, y_train, x_test, y_test, num_of_sample=10, pdist=0.6, seed=1, verbose=True):
    """Split the train and test sets into ``num_of_sample`` non-IID subsets.

    Both sets are partitioned with ``_get_non_iid_subsamples_indices`` using
    the same ``pdist`` and ``seed``. With ``verbose`` the subset statistics
    are printed. Returns ``(x_train_dict, y_train_dict, x_test_dict, y_test_dict)``.
    """
    train_index_frames = _get_non_iid_subsamples_indices(y_train, num_of_sample, pdist, seed)
    train_split = _create_subsamples(train_index_frames, x_train, y_train, 'x_train', 'y_train')

    test_index_frames = _get_non_iid_subsamples_indices(y_test, num_of_sample, pdist, seed)
    test_split = _create_subsamples(test_index_frames, x_test, y_test, 'x_test', 'y_test')

    x_train_dict, y_train_dict = train_split
    x_test_dict, y_test_dict = test_split
    result = (x_train_dict, y_train_dict, x_test_dict, y_test_dict)
    if verbose:
        _print_dict(*result)
    return result

In [298]:
def create_corrupted_non_iid_samples(x_train, y_train, x_test, y_test, 
                                     cor_local_ratio=1.0,
                                     cor_minor_label_cnt=4,
                                     cor_major_data_ratio=0.2,
                                     cor_minor_data_ratio=0.5, mode=1,
                                     num_of_sample=10, pdist=0.6, seed=1, verbose=True):
    """Non-IID split whose training labels are corrupted.

    The training subsets go through ``_create_corrupted_subsamples2`` with the
    given corruption settings; the test subsets are built uncorrupted with
    ``_create_subsamples``. With ``verbose`` the subset statistics are
    printed. Returns ``(x_train_dict, y_train_dict, x_test_dict, y_test_dict)``.
    """
    train_index_frames = _get_non_iid_subsamples_indices(y_train, num_of_sample, pdist, seed)
    train_split = _create_corrupted_subsamples2(
        train_index_frames, x_train, y_train, 'x_train', 'y_train',
        cor_local_ratio=cor_local_ratio,
        cor_minor_label_cnt=cor_minor_label_cnt,
        cor_major_data_ratio=cor_major_data_ratio,
        cor_minor_data_ratio=cor_minor_data_ratio,
        mode=mode,
    )

    test_index_frames = _get_non_iid_subsamples_indices(y_test, num_of_sample, pdist, seed)
    test_split = _create_subsamples(test_index_frames, x_test, y_test, 'x_test', 'y_test')

    x_train_dict, y_train_dict = train_split
    x_test_dict, y_test_dict = test_split
    result = (x_train_dict, y_train_dict, x_test_dict, y_test_dict)
    if verbose:
        _print_dict(*result)
    return result

In [9]:
def create_iid_samples(x_train, y_train, x_test, y_test, num_of_sample=10, seed=1, verbose=True):
    """Split the train and test sets into ``num_of_sample`` IID subsets.

    Both sets are partitioned with ``_get_iid_subsamples_indices`` using the
    same ``seed``. With ``verbose`` the subset statistics are printed.
    Returns ``(x_train_dict, y_train_dict, x_test_dict, y_test_dict)``.
    """
    train_index_frames = _get_iid_subsamples_indices(y_train, num_of_sample, seed)
    train_split = _create_subsamples(train_index_frames, x_train, y_train, 'x_train', 'y_train')

    test_index_frames = _get_iid_subsamples_indices(y_test, num_of_sample, seed)
    test_split = _create_subsamples(test_index_frames, x_test, y_test, 'x_test', 'y_test')

    x_train_dict, y_train_dict = train_split
    x_test_dict, y_test_dict = test_split
    result = (x_train_dict, y_train_dict, x_test_dict, y_test_dict)
    if verbose:
        _print_dict(*result)
    return result

In [297]:
def create_corrupted_iid_samples(x_train, y_train, x_test, y_test,
                                 cor_local_ratio=1.0, cor_label_ratio=0.2, cor_data_ratio=0.5, mode=1,
                                 num_of_sample=10, seed=1, verbose=True):
    """IID split whose training labels are corrupted.

    The training subsets go through ``_create_corrupted_subsamples`` with the
    given corruption settings; the test subsets are built uncorrupted with
    ``_create_subsamples``. With ``verbose`` the subset statistics are
    printed. Returns ``(x_train_dict, y_train_dict, x_test_dict, y_test_dict)``.
    """
    train_index_frames = _get_iid_subsamples_indices(y_train, num_of_sample, seed)
    train_split = _create_corrupted_subsamples(
        train_index_frames, x_train, y_train, 'x_train', 'y_train',
        cor_local_ratio=cor_local_ratio,
        cor_label_ratio=cor_label_ratio,
        cor_data_ratio=cor_data_ratio,
        mode=mode,
    )

    test_index_frames = _get_iid_subsamples_indices(y_test, num_of_sample, seed)
    test_split = _create_subsamples(test_index_frames, x_test, y_test, 'x_test', 'y_test')

    x_train_dict, y_train_dict = train_split
    x_test_dict, y_test_dict = test_split
    result = (x_train_dict, y_train_dict, x_test_dict, y_test_dict)
    if verbose:
        _print_dict(*result)
    return result

In [10]:
def create_dataloader(x_train, y_train, x_test, y_test, batch_size):
    """Wrap tensor pairs in ``DataLoader`` objects.

    Parameters
    ----------
    x_train, y_train : torch.Tensor or None
        Training features/labels. The training loader is only built when both
        are given; it shuffles and uses ``batch_size``.
    x_test, y_test : torch.Tensor or None
        Test features/labels. The test loader is only built when both are
        given; it uses a batch size of 1 and does not shuffle.
    batch_size : int
        Batch size of the training loader.

    Returns
    -------
    (DataLoader or None, DataLoader or None)
        ``(train_loader, test_loader)``.
    """
    train_data = None
    test_data = None

    # Identity checks: ``!= None`` would go through the tensors' rich
    # comparison operators instead of simply testing for a missing argument.
    if x_train is not None and y_train is not None:
        train_data = DataLoader(TensorDataset(x_train, y_train), batch_size=batch_size, shuffle=True)
    if x_test is not None and y_test is not None:
        test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=1)

    return train_data, test_data

## Test

In [179]:
# Load MNIST from the default path ('./data/mnist.pkl.gz') as torch tensors.
tr_X, tr_y, te_X, te_y = load_mnist_data()

torch.Size([60000, 1, 28, 28]) torch.Size([60000]) torch.Size([10000, 1, 28, 28]) torch.Size([10000])


In [294]:
# IID split into 10 subsets with corrupted training labels; mode=2 selects the
# per-row random replacement branch (printed as "-> random value").
tr_X_iid_dict, tr_y_iid_dict, te_X_iid_dict, te_y_iid_dict = create_corrupted_iid_samples(
    tr_X, tr_y, te_X, te_y,
    cor_local_ratio=1.0, cor_label_ratio=0.2, cor_data_ratio=0.5, mode=2,
    num_of_sample=10, seed=1, verbose=True
)

[1, 6] -> random value
[*] Train Dataset (x, y)
- sample0: 5996, 5996
: 674 337 689 692 655 617 296 716 656 664 
- sample1: 5996, 5996
: 666 337 677 711 664 622 296 686 657 680 
- sample2: 5996, 5996
: 661 337 679 695 651 637 296 705 675 660 
- sample3: 5996, 5996
: 664 337 671 709 648 628 296 716 667 660 
- sample4: 5996, 5996
: 672 337 656 701 657 618 296 711 671 677 
- sample5: 5996, 5996
: 677 337 668 695 669 619 296 705 659 671 
- sample6: 5996, 5996
: 667 337 675 696 671 602 296 714 676 662 
- sample7: 5996, 5996
: 662 337 685 714 665 616 296 700 659 662 
- sample8: 5996, 5996
: 667 337 697 674 675 641 296 691 649 669 
- sample9: 6036, 6036
: 665 338 681 689 670 610 300 716 672 695 
# total: 60000

[*] Test Dataset (x, y)
- sample0: 996, 996
: 98 113 103 101 98 89 95 102 97 100 
- sample1: 996, 996
: 98 113 103 101 98 89 95 102 97 100 
- sample2: 996, 996
: 98 113 103 101 98 89 95 102 97 100 
- sample3: 996, 996
: 98 113 103 101 98 89 95 102 97 100 
- sample4: 996, 996
: 98 113 1

In [301]:
# Non-IID split into 10 subsets with corrupted training labels (pdist left at
# its default).
# NOTE(review): the results are bound to the *_iid_dict names although the
# split is non-IID -- confirm the naming is intended.
tr_X_iid_dict, tr_y_iid_dict, te_X_iid_dict, te_y_iid_dict = create_corrupted_non_iid_samples(
    tr_X, tr_y, te_X, te_y,
    cor_local_ratio=1.0,
    cor_minor_label_cnt=1,
    cor_major_data_ratio=0.2,
    cor_minor_data_ratio=0.5, mode=1,
    num_of_sample=10, seed=1, verbose=True
)

[0, 5] | [6] -> [3]
[1, 6] | [8] -> [0]
[2, 7] | [6] -> [3]
[3, 8] | [4] -> [0]
[4, 9] | [8] -> [7]
[5, 0] | [9] -> [2]
[6, 1] | [4] -> [7]
[7, 2] | [5] -> [6]
[8, 3] | [0] -> [6]
[9, 4] | [8] -> [7]
[*] Train Dataset (x, y)
- sample0: 5882, 5882
: 2207 337 297 453 292 1246 148 313 292 297 
- sample1: 6243, 6243
: 442 2501 297 306 292 271 1378 313 146 297 
- sample2: 6022, 6022
: 296 337 2252 453 292 271 148 1384 292 297 
- sample3: 6020, 6020
: 442 337 297 2294 146 271 295 313 1328 297 
- sample4: 5932, 5932
: 296 337 297 306 2200 271 295 459 146 1325 
- sample5: 5781, 5781
: 1294 337 445 306 292 2058 295 313 292 149 
- sample6: 6079, 6079
: 296 1457 297 306 146 271 2258 459 292 297 
- sample7: 6083, 6083
: 296 337 1354 306 292 136 430 2343 292 297 
- sample8: 5964, 5964
: 148 337 297 1356 292 271 443 313 2210 297 
- sample9: 5994, 5994
: 298 339 305 311 1317 272 303 463 149 2237 
# total: 60000

[*] Test Dataset (x, y)
- sample0: 971, 971
: 391 56 51 50 49 178 47 51 48 50 
- sample1:

In [237]:
# Scratch cell: create two small int32 tensors.
aaa = torch.IntTensor([1, 2, 3])
bbb = torch.IntTensor([2, 2, 2])


In [238]:
# Rebind the name aaa to the tensor object bbb refers to (no data is copied).
aaa = bbb

In [239]:
# Display the tensor currently bound to aaa.
aaa

tensor([2, 2, 2], dtype=torch.int32)

In [256]:
# Scratch cell: draw 100 values from [1, 2, 3, 4], one per random.sample call,
# and count how many of them are 4. No seed is set in this cell, so the count
# can differ between runs.
data = list()
for i in range(100):
    data.append(random.sample([1, 2, 3, 4], 1)[0])
data = np.array(data)
len(np.where(data==4)[0])

29

In [185]:
# Non-IID index split of the training labels: 10 subsets, pdist=0.9, seed=1.
sample_tr_dict = _get_non_iid_subsamples_indices(tr_y, 10, 0.9, 1)

In [188]:
# Labels of subset 0: take its row positions in ascending order and index the
# training labels with them; display the labels and their count.
sample_name="sample0"
indices=np.sort(np.array(sample_tr_dict[sample_name]['index']))

y_info= tr_y[indices]
y_info, len(y_info)

(tensor([5, 0, 0,  ..., 0, 5, 5]), 5784)

In [191]:
# Rebuild y_info as a NumPy array (via a Python list) from the same labels.
y_info= np.array(tr_y[indices].tolist())

In [198]:
# Positions within y_info whose label equals 5.
np.where(y_info==5)[0]

array([   0,    3,    7, ..., 5779, 5782, 5783], dtype=int64)

In [213]:
# Scratch prototype of the major/minor label-corruption logic. It works
# directly on notebook globals defined in other cells (sample_tr_dict, tr_y,
# y_info) instead of function arguments.
cor_local_ratio = 1.0
cor_minor_label_cnt = 2
cor_major_data_ratio = 0.2
cor_minor_data_ratio = 0.5

num_of_local = len(sample_tr_dict)
num_of_label = len(set(tr_y.tolist()))
cor_local_idx = random.sample(range(0, num_of_local), int(num_of_local * cor_local_ratio))

# NOTE(review): the per-subset loading of y_info is commented out below, so
# every iteration keeps modifying the single y_info array left by an earlier
# cell rather than the labels of subset i.
for i in range(len(sample_tr_dict)):  ### len(sample_dict)= number of samples
#     xname= x_name+str(i)
#     yname= y_name+str(i)
#     sample_name="sample"+str(i)

#     indices=np.sort(np.array(sample_dict[sample_name]['index']))

#     x_info= x_data[indices,:]
#     x_data_dict.update({xname : x_info})

#     y_info= y_data[indices]
    if i in cor_local_idx:
        # Major labels: i and the label five positions further (mod label count).
        cor_major_label_idx = list()
        cor_major_label_idx.append(i)
        cor_major_label_idx.append((i+5)%num_of_label)
        print(cor_major_label_idx)
        
        for j in cor_major_label_idx:
            temp_dices = np.where(y_info == j)[0]
            cor_data_len = int(len(temp_dices) * cor_major_data_ratio)
            corrupted_idx = random.sample(list(temp_dices), cor_data_len)

            # NOTE(review): indexing [0] raises IndexError when corrupted_idx
            # is empty (no row with label j, or the ratio rounds down to 0).
            ori_val = y_info[corrupted_idx][0]
            y_info[corrupted_idx] = (ori_val + 5) % num_of_label
        
#         cor_minor_label_idx = random.sample(range(0, num_of_label), int(cor_minor_label_cnt))
        # NOTE(review): random.sample() on a set raises TypeError on
        # Python 3.11+; cor_minor_val is a one-element list, not a scalar.
        temp = set(tr_y.tolist())
        temp.difference_update(cor_major_label_idx)
        cor_minor_label_idx = random.sample(temp, cor_minor_label_cnt)
        temp.difference_update(cor_minor_label_idx)
        cor_minor_val = random.sample(temp, 1)

        # NOTE(review): val_cnt and cor_val are assigned but never used.
        val_cnt = 0
        for j in cor_minor_label_idx:
            temp_dices = np.where(y_info == j)[0]
            cor_data_len = int(len(temp_dices) * cor_minor_data_ratio)
            corrupted_idx = random.sample(list(temp_dices), cor_data_len)

            ori_val = y_info[corrupted_idx][0]
            cor_val = (ori_val + 1) % num_of_label
            y_info[corrupted_idx] = cor_minor_val

[0, 5]
[1, 6]
[2, 7]
[3, 8]
[4, 9]
[5, 0]
[6, 1]
[7, 2]
[8, 3]
[9, 4]


In [211]:
# Scratch cell: remove the labels 0 and 5 from the label set, draw 4 "minor"
# labels from the rest, remove those too and display what is left.
# NOTE(review): random.sample() on a set raises TypeError on Python 3.11+.
aaa = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
aaa = set(aaa)
aaa.difference_update([0, 5])
minor_label = random.sample(aaa, 4)
print(minor_label)
aaa.difference_update(minor_label)
aaa = list(aaa)
aaa

[4, 3, 1, 6]


[2, 7, 8, 9]