# Creating Train Test Sets From Classed Images
> Author: Hannan Khan  
> Last Updated: 2022-04-14 16:49:21

This notebook converts this:
```
src_dir
----class1
    ----image1
    ----image2
    ----...
----class2
    ----image1
    ----image2
    ----...
----...
```
To this:
```
src_dir
----train_dir
    ----class1
        ----image1
        ----image5
        ----...
    ----class2
        ----image6
        ----image28
        ----...
----test_dir
    ----class1
        ----image14
        ----image54
        ----...
    ----class2
        ----image63
        ----image283
        ----...
```

The decision to put an image in either the train or test dir is given by the mask (generated by the ```seed```). This requires all classes to have the same number of images initially. The 1's in the mask denote a train set designation. The train set proportion can be set by settting ```train_size```.

In [8]:
import os
import pprint
import math
import numpy as np

In [9]:
src_dir = r'D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations'
train_dir = os.sep.join([src_dir, 'train'])
test_dir = os.sep.join([src_dir, 'test'])

# Making sure each class has the same number of images.

In [10]:
classes = {}
num_imgs_in_each_cls = None

for cls in os.listdir(src_dir):
    # if the class is a folder
    if os.path.isdir(os.sep.join([src_dir, cls])):
        # count num images in class
        num_imgs_in_cls = 0
        for img in os.listdir(os.sep.join([src_dir, cls])):
            if img.lower().endswith('.jpeg') or img.lower().endswith('png'):
                num_imgs_in_cls += 1
        classes[cls] = num_imgs_in_cls

print("Class with num images:")
pprint.pprint(classes)

values = list(classes.values())
if all(ele == values[0] for ele in values):
    print("All classes have the same number of images.")
    num_imgs_in_each_cls = values[0]
else:
    print("Some classes do not have the same number of images as others. Please fix before continuing.")
    num_imgs_in_each_cls = None

Class with num images:
{'0': 560,
 '1': 560,
 '2': 560,
 '3': 560,
 '4': 560,
 '5': 560,
 '6': 560,
 '7': 560,
 '8': 560,
 '9': 560}
All classes have the same number of images.


# Creating A Mask

In [11]:
seed = 42
np.random.seed(seed)
train_size = 0.80
num_train_imgs = math.floor(train_size*num_imgs_in_each_cls)

mask = np.array([0] * (num_imgs_in_each_cls-num_train_imgs) + [1] * num_train_imgs)
np.random.shuffle(mask)
print("Train Mask Generated:")
print(mask)
print("Number of train images per class:", str((mask == 1).sum()))
print("Number of test images per class:", str((mask == 0).sum()))

Train Mask Generated:
[1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 0 1 1 1 1
 1 0 1 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1
 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0
 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 1 1
 0 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 

# Creating Train/Test Folders

In [12]:
# create train folder:
train_dir = os.sep.join([src_dir, 'train'])
if not os.path.exists(train_dir):
    os.mkdir(train_dir)
for cls in list(classes.keys()):
    cls_path = os.sep.join([train_dir, cls])
    if not os.path.exists(cls_path):
        os.mkdir(cls_path)

# create test folder:
test_dir = os.sep.join([src_dir, 'test'])
if not os.path.exists(test_dir):
    os.mkdir(test_dir)
for cls in list(classes.keys()):
    cls_path = os.sep.join([test_dir, cls])
    if not os.path.exists(cls_path):
        os.mkdir(cls_path)

# Moving Images To Appropriate Folder

In [13]:
for cls in list(classes.keys()):
    cls_src_path = os.sep.join([src_dir, cls])
    for i,img_name in enumerate(os.listdir(cls_src_path)):
        img_src_path = os.sep.join([cls_src_path, img_name])
        if mask[i]:
            # move to train folder if mask[i] is 1
            img_dest_path = os.sep.join([train_dir, cls, img_name])
            os.rename(img_src_path, img_dest_path)
            print(img_src_path, "Moved to\n\t\t", img_dest_path)
        else:
            # move to test folder if mask[i] is 0
            img_dest_path = os.sep.join([test_dir, cls, img_name])
            os.rename(img_src_path, img_dest_path)
            print(img_src_path, "Moved to\n\t\t", img_dest_path)

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\0\0.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\0\0.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\0\1.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\0\1.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\0\10.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\0\10.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\0\100.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\0\100.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\0\101.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\0\101.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\1\1106.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\1\1106.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\1\1107.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\1\1107.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\1\1108.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\1\1108.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\1\1109.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\1\1109.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\1\1110.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\1\1110.png
D:\Datasets\Ishihara_Numbers\aug

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\2\1382.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\2\1382.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\2\1383.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\2\1383.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\2\1384.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\2\1384.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\2\1385.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\2\1385.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\2\1386.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\2\1386.png
D:\Datasets\Ishihara_Numbers\au

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\3\2078.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\3\2078.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\3\2079.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\3\2079.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\3\2080.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\3\2080.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\3\2081.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\3\2081.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\3\2082.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\3\2082.png
D:\Datasets\Ishihara_Numbers\au

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\4\2786.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\4\2786.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\4\2787.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\4\2787.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\4\2788.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\4\2788.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\4\2789.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\4\2789.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\4\2790.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\4\2790.png
D:\Datasets\Ishihara_Numbers\aug

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\6\3469.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\6\3469.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\6\3470.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\6\3470.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\6\3471.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\6\3471.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\6\3472.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\6\3472.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\6\3473.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\6\3473.png
D:\Datasets\Ishihara_Numbers\au

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\7\4172.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\7\4172.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\7\4173.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\7\4173.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\7\4174.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\7\4174.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\7\4175.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\7\4175.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\7\4176.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\7\4176.png
D:\Datasets\Ishihara_Numbers\au

D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\8\4897.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\8\4897.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\8\4898.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\8\4898.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\8\4899.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\8\4899.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\8\4900.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\test\8\4900.png
D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\8\4901.png Moved to
		 D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\train\8\4901.png
D:\Datasets\Ishihara_Numbers\au

# Delete Now Empty Class Directories

In [14]:
for folder in os.listdir(src_dir):
    folder_dir = os.sep.join([src_dir, folder])
    if not os.listdir(folder_dir):
        os.rmdir(folder_dir)
        print("Deleted:", folder_dir)

Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\0
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\1
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\2
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\3
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\4
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\5
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\6
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\7
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\8
Deleted: D:\Datasets\Ishihara_Numbers\augmented_numbers_imgs_128_noShiftAugmentations\9
