In [157]:
#!/usr/bin/python

import numpy
import random

def split(labels, ratio_t, ratio_v, mode, shuffle=True):
    """ Splits the input labels into training, validation and testing
    and returns the corresponding indices
    Params
    ------------
    labels: list or 1-D array
        Class label of every instance. Only used to locate the positions
        of each class; the returned values are indices into it.
    ratio_t: float [0,1]
        Ratio of training instances
    ratio_v: float[0, 1]
        Ratio of validation instances. Whatever is left after training
        and validation goes to testing.
    mode: string
        How to split the data:
            - random: Randomly assigns instances to each set. Uses all instances
            - same: All classes are assigned the same number of instances in
                each set. It is bounded by the less common class.
            - ratio: Preserves the ratio of the classes in the original set.
    shuffle:boolean
        Whether to shuffle final indices at the end
    Returns
    ------------
    tuple (train, validation, test) of index arrays
    Raises
    ------------
    ValueError
        If a ratio is outside [0, 1], if both ratios add up to more
        than 1, or if the mode is unknown """

    _check_ratio(ratio_t)
    _check_ratio(ratio_v)
    # Small tolerance so pairs that add up to 1 on paper are not rejected
    # because of floating point rounding
    if ratio_t + ratio_v > 1.0 + 1e-9:
        raise ValueError('Training and validation ratios must not add up to more than 1')

    # Element-wise comparison below needs an array, not a plain list
    labels = numpy.asarray(labels)

    # Get stats from classes
    vals = numpy.unique(labels)
    positions = {i: numpy.where(labels == i)[0] for i in vals}

    if mode == 'same':
        tr, v, t = _split_same(positions, ratio_t, ratio_v)
    elif mode == 'ratio':
        tr, v, t = _split_ratio(positions, ratio_t, ratio_v)
    elif mode == 'random':
        tr, v, t = _split_random(labels, ratio_t, ratio_v)
    else:
        raise ValueError('Unknown split mode {}'.format(mode))

    if shuffle:
        # Same generator as the permutations, so seeding numpy.random
        # alone makes the whole split reproducible
        numpy.random.shuffle(tr)
        numpy.random.shuffle(v)
        numpy.random.shuffle(t)

    return tr, v, t


def _split_random(labels, ratio_t, ratio_v):
    """ Splits the data into sets in a random fashion """
    ct = int(numpy.floor(len(labels) * ratio_t))
    vt = int(numpy.floor(len(labels) * ratio_v))
    perm = numpy.random.permutation(len(labels))
    return perm[0:ct], perm[ct:(ct + vt)], perm[(ct + vt):]


def _split_ratio(positions, ratio_t, ratio_v):
    """ Given the positions for each class, splits data preserving
    the class ratio in the original set
    Params
    ---------
    positions: dict
        Key is the label identifier and each entry contains
        the positions in the original dataset
    ratio_t: float
        Training ratio
    ratio_v: float
        Validation ratio
    """

    train, val, test = [[]] * 3
    for i in positions.keys():
        # Split permutation so each class is split according to 
        # the original distribution
        perm = numpy.random.permutation(len(positions[i]))
        ct = int(numpy.floor(len(perm) * ratio_t))
        cv = int(numpy.floor(len(perm) * ratio_v))
        cte = len(perm) - cv - ct
        # Concat results
        train = train + positions[i][perm[0:ct]].tolist()
        val = val + positions[i][perm[ct:(ct + cv)]].tolist()
        test = test + positions[i][perm[(ct + cv):(ct + cv + cte)]].tolist()
    return numpy.asarray(train), numpy.asarray(val), numpy.asarray(test)


def _split_same(positions, ratio_t, ratio_v):
    """ Given the positions for each class, balances the class instances
    for training, validation and testing 
    Params
    ---------
    positions: dict
        Key is the label identifier and each entry contains
        the positions in the original dataset
    ratio_t: float
        Training ratio
    ratio_v: float
        Validation ratio
    """

    # Get minimum represented class
    counts = { i: len(positions[i]) for i in positions.keys()}
    per_class = min(counts.values())
    train, val, test = [[]] * 3
    # Split data so each class is taken 'per_class' instances
    ct = int(numpy.ceil(per_class * ratio_t))
    cv = int(numpy.ceil(per_class * ratio_v))
    cte = per_class - ct - cv
    # Iterate through classes so all classes have the same
    # number of instances in each set
    for i in positions.keys():
        perm = numpy.random.permutation(len(positions[i]))
        train = train + positions[i][perm[0:ct]].tolist()
        val = val + positions[i][perm[ct:(ct + cv)]].tolist()
        test = test + positions[i][perm[(ct + cv):(ct + cv + cte)]].tolist()
    return numpy.asarray(train), numpy.asarray(val), numpy.asarray(test)


def _check_ratio(r):
    if r < 0.0 or r > 1.0:
        raise ValueError('Ratio must be between 0 and 1, both included')

def count_class_ratio(original, indices):
    """ Given a set of indices and the original set, computes the
    ratio of classes
    Params
    ---------
    original: list or array
        Label of every instance in the full dataset
    indices: iterable of int
        Positions in `original` to take into account
    Returns
    ---------
    dict mapping each label found at `indices` to the fraction of the
    selected instances carrying it. Empty if `indices` is empty.
    """

    # Gather one by one so `original` may be a plain list or an array
    picked = numpy.asarray([original[i] for i in indices])
    # Single pass over the selection instead of one comparison per class
    vals, counts = numpy.unique(picked, return_counts=True)
    total = float(len(picked))
    return {val: count / total for val, count in zip(vals, counts)}

def print_split(labels, tr, v, t):
    """ Prints the size of the training, validation and testing sets,
    followed by the fraction of each class inside every set and inside
    the full label list. """
    size_line = 'Training size: {}, Validation size: {}, Testing size: {}'
    print(size_line.format(len(tr), len(v), len(t)))
    print('Instances per class:')
    reports = (
        ('---- > Training: {}', tr),
        ('---- > Validation: {}', v),
        ('---- > Testing: {}', t),
    )
    for template, subset in reports:
        print(template.format(count_class_ratio(labels, subset)))
    # Every index of the dataset, to report the overall class ratio
    every_index = range(0, len(labels))
    print('Original class ratio: {}'.format(count_class_ratio(labels, every_index)))

In [158]:
# Synthetic two-class dataset: 5000 labels, each drawn as 0 or 1
labels = [random.randint(0, 1) for _ in range(0, 5000)]

# Fractions of the data assigned to training and validation
ratio_t, ratio_v = 0.70, 0.20
# The labels can also be used as an array:
#labels = numpy.asarray(labels)

In [159]:
# Testing random mode: the three sets are consecutive slices of a single
# permutation of all indices, so every instance is used exactly once.
# shuffle=False skips the extra per-set shuffle at the end of split().
tr, v, t = split(labels, ratio_t, ratio_v, 'random', shuffle=False)
print_split(labels, tr, v, t)

Training size: 3500, Validation size: 1000, Testing size: 500
Instances per class:
---- > Training: {0: 0.50257142857142856, 1: 0.49742857142857144}
---- > Validation: {0: 0.49399999999999999, 1: 0.50600000000000001}
---- > Testing: {0: 0.50800000000000001, 1: 0.49199999999999999}
Original class ratio: {0: 0.50139999999999996, 1: 0.49859999999999999}


In [160]:
# Testing same mode: every class contributes the same number of instances
# to each set, bounded by the least common class, so instances of a more
# common class may be left out of all three sets.
tr, v, t = split(labels, ratio_t, ratio_v, 'same', shuffle=False)
print_split(labels, tr, v, t)

Training size: 3492, Validation size: 998, Testing size: 496
Instances per class:
---- > Training: {0: 0.5, 1: 0.5}
---- > Validation: {0: 0.5, 1: 0.5}
---- > Testing: {0: 0.5, 1: 0.5}
Original class ratio: {0: 0.50139999999999996, 1: 0.49859999999999999}


In [161]:
# Testing ratio mode: each class is split separately with the same ratios,
# so every set keeps roughly the original class proportions. shuffle is
# left at its default (True), so each returned set is shuffled.
tr, v, t = split(labels, ratio_t, ratio_v, 'ratio')
print_split(labels, tr, v, t)

Training size: 3499, Validation size: 999, Testing size: 502
Instances per class:
---- > Training: {0: 0.50128608173763933, 1: 0.49871391826236067}
---- > Validation: {0: 0.50150150150150152, 1: 0.49849849849849848}
---- > Testing: {0: 0.50199203187250996, 1: 0.49800796812749004}
Original class ratio: {0: 0.50139999999999996, 1: 0.49859999999999999}
