# Shuffling Methods

test cases

In [1]:
test_inputs = ['abc', 'def', 'egh', 'ijk', 'lmn', 'opq', 'rst', 'ade', 'des', 'asd', 'a', 'sd', 'sda', 'e']
test_targets = [1, 2, 1, 1, 2, 1, 2, 3, 4, 3, 1, 4, 2, 3]
expected_output = [
    ['abc', 'egh', 'ijk'],
    ['def', 'lmn', 'rst'],
    ['ade', 'asd', 'e'],
    ['opq', 'a', 'sda'],
    ['des', 'sd']
]

implementation

ideas for shuffle one class:
- sort the list by class, take as many pairs for the batches as possible
- build a dictionary with the classes and take batches from them

In [2]:
def build_map_classes_buffer(inputs, targets):
    '''
    Build a dictionary which has the targets as keys and all the corresponding inputs as values
    -- Params:
    @inputs: input data for the model
    @targets: desired outcome for the model given that it gets @inputs
    -- Return: dictionary
    '''
    # build a dictionary to grab pairs from
    map_classes_buffer = defaultdict(list) # stores class -> inputs pairs
    list(map(lambda x, y: map_classes_buffer[y].append(x), inputs, targets)) #without list, the writes to the map are not commited
    return map_classes_buffer

In [3]:
from collections import defaultdict
import numpy as np

def batches_one_class(inputs, targets, batch_size=64):
    '''
    Splits the data (inputs and targets) into as many homogeneous batches as possible,
    i.e. that in as many cases as possible, the batches consist of one target only.
    -- Params:
    @inputs: input data for the model
    @targets: desired outcome for the model given that it gets @inputs
    @batch_size: number of samples per batch
    -- Return: 2d list
    '''
    
    assert len(inputs) == len(targets), 'Inputs and targets do not have the same size'
    
    batches = []
    inputs_buffer = [] # store inputs that don't fit into homogeneous batches anymore
    
    num_batches = np.ceil(len(inputs) / batch_size)
    map_classes_buffer = build_map_classes_buffer(inputs, targets)
    
    for target in map_classes_buffer.keys():
        while len(map_classes_buffer[target]) > 0:
            taken_inputs, retrieved_inputs = map_classes_buffer[target][:batch_size], map_classes_buffer[target][batch_size:]
            if len(taken_inputs) < batch_size:
                inputs_buffer.extend(taken_inputs)
            else:
                batches.append(taken_inputs)
            map_classes_buffer[target] = retrieved_inputs
            
    while len(inputs_buffer) > 0:
        taken_inputs, inputs_buffer = inputs_buffer[:batch_size], inputs_buffer[batch_size:]
        batches.append(taken_inputs)
                
    assert len(batches) == num_batches, 'Error in implementation, number of batches is wrong!'
    
    return batches
    
    
actual_output = shuffle_one_class(test_inputs, test_targets, batch_size=3)
assert actual_output == expected_output, 'Actual and expected outputs differ!'

NameError: name 'shuffle_one_class' is not defined

In [None]:
def shuffle_all_classes(inputs, targets):
    pass

---

test on real data

In [None]:
from utils.data_utils import DataLoader
dataloader = DataLoader(batch_sizes={'train': 64, 'test': 64}, shuffle={'train': False, 'test': False})
dataloader.download_cifar()
trainloader, testloader = dataloader.get_loaders()

for batch_idx, (inputs, targets) in enumerate(testloader):
    _inputs = inputs
    _targets = targets
    break