In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Toy dataset: 10 samples (rows) x 4 features (columns).
# Row r (1-based) holds 10*r + [1, 2, 3, 4], so every row stays recognizable
# after the data are shuffled and partitioned.
# Broadcasting a (10,1) column against a (4,) row builds the same matrix as
# tiling both operands to (10,4) and adding them.
fakedata = 10*np.arange(1,11)[:,None] + np.arange(1,5)

# boolean labels: False for the first five samples, True for the last five
fakelabels = np.arange(10)>4

# one print per statement (no throwaway tuple of None values)
print(fakedata)
print(' ')
print(fakelabels)

[[ 11  12  13  14]
 [ 21  22  23  24]
 [ 31  32  33  34]
 [ 41  42  43  44]
 [ 51  52  53  54]
 [ 61  62  63  64]
 [ 71  72  73  74]
 [ 81  82  83  84]
 [ 91  92  93  94]
 [101 102 103 104]]
 
[False False False False False  True  True  True  True  True]


## Sklearn

In [7]:
# proportions of the data assigned to train / devset / test
partitions = [.8,.1,.1]
assert np.isclose(np.sum(partitions), 1), 'partition proportions must sum to 1'

# fixed seed so that re-running the notebook reproduces the same partitions
randomSeed = 42

# step 1: split off the training set; everything else is held out temporarily
# and shared between the devset and the test set
train_data,testTMP_data, train_labels,testTMP_labels = \
                   train_test_split(fakedata, fakelabels,
                                    train_size=partitions[0], random_state=randomSeed)

# step 2: the devset proportion must be re-expressed relative to the held-out
# portion (not the full dataset) before splitting that portion again
split = partitions[1] / np.sum(partitions[1:])
print(f'Split : {split}')

devset_data,test_data, devset_labels,test_labels = \
              train_test_split(testTMP_data, testTMP_labels,
                               train_size=split, random_state=randomSeed)

Split : 0.5


In [8]:
# report the shape of every partition, one line each
size_report = [
    ('Training data size: ', train_data),
    ('Devset data size: ',   devset_data),
    ('Test data size: ',     test_data),
]
for heading, subset in size_report:
    print(heading + str(subset.shape))
print(' ')

# show the rows that ended up in each partition,
# with a blank separator line between consecutive partitions
content_report = [
    ('Training data: ', train_data),
    ('Devset data: ',   devset_data),
    ('Test data: ',     test_data),
]
for position, (heading, subset) in enumerate(content_report):
    if position > 0:
        print(' ')
    print(heading)
    print(subset)

Training data size: (8, 4)
Devset data size: (1, 4)
Test data size: (1, 4)
 
Training data: 
[[ 91  92  93  94]
 [101 102 103 104]
 [ 51  52  53  54]
 [ 61  62  63  64]
 [ 71  72  73  74]
 [ 31  32  33  34]
 [ 21  22  23  24]
 [ 11  12  13  14]]
 
Devset data: 
[[81 82 83 84]]
 
Test data: 
[[41 42 43 44]]


## Numpy

In [9]:
# partition sizes in proportion (train, devset, test)
partitions = np.array([.8,.1,.1])
assert np.isclose(partitions.sum(), 1), 'partition proportions must sum to 1'
print('Partition proportions:')
print(partitions)
print(' ')

# convert those into integer row boundaries (cumulative sample counts).
# astype(int) truncates, and a float product can land just below a whole number
# (e.g. 0.29*100 evaluates to 28.999999999999996), which would silently move a
# boundary by one row. Rounding that noise away first fixes the near-integer
# case while keeping truncation for genuinely fractional counts.
nSamples = len(fakelabels)
partitionBnd = np.round(np.cumsum(partitions*nSamples), decimals=8).astype(int)
print('Partition boundaries:')
print(partitionBnd)
print(' ')


# random indices, drawn from a locally seeded generator so the shuffle is
# reproducible and does not touch numpy's global random state
rng = np.random.default_rng(42)
randindices = rng.permutation(nSamples)
print('Randomized data indices:')
print(randindices)
print(' ')

Partition proportions:
[0.8 0.1 0.1]
 
Partition boundaries:
[ 8  9 10]
 
Randomized data indices:
[9 3 7 8 4 0 6 5 1 2]
 


In [10]:
# cut the shuffled indices at the first two boundaries, giving three index
# sets: [0, bnd0) for training, [bnd0, bnd1) for the devset, [bnd1, end) for test
train_idx, devset_idx, test_idx = np.split(randindices, partitionBnd[:2])

# training partition: rows and their matching labels
train_dataN   = fakedata[train_idx]
train_labelsN = fakelabels[train_idx]

# devset partition
devset_dataN   = fakedata[devset_idx]
devset_labelsN = fakelabels[devset_idx]

# test partition
test_dataN   = fakedata[test_idx]
test_labelsN = fakelabels[test_idx]

In [11]:
# report the shape of every numpy-made partition, one line each
shape_lines = (
    ('Training data size: ', train_dataN),
    ('Devset size: ',        devset_dataN),
    ('Test data size: ',     test_dataN),
)
for caption, block in shape_lines:
    print(caption + str(block.shape))
print(' ')

# show the rows of each partition; a blank line follows every partition
# except the last one
row_sections = (
    ('Training data: ', train_dataN),
    ('Devset data: ',   devset_dataN),
    ('Test data: ',     test_dataN),
)
final_section = len(row_sections) - 1
for order, (caption, block) in enumerate(row_sections):
    print(caption)
    print(block)
    if order < final_section:
        print(' ')

Training data size: (8, 4)
Devset size: (1, 4)
Test data size: (1, 4)
 
Training data: 
[[101 102 103 104]
 [ 41  42  43  44]
 [ 81  82  83  84]
 [ 91  92  93  94]
 [ 51  52  53  54]
 [ 11  12  13  14]
 [ 71  72  73  74]
 [ 61  62  63  64]]
 
Devset data: 
[[21 22 23 24]]
 
Test data: 
[[31 32 33 34]]
