Source: [link](https://github.com/cs230-stanford/cs230-code-examples/blob/master/tensorflow/vision/build_dataset.py)
### TODO
- DONE Create a Python script with argparse
- DONE Handle minority classes > 10
- DONE Test OpenCV vs. Pillow (OpenCV is only marginally faster)

In [166]:
import argparse
import random
import os
import cv2
import numpy as np

from math import ceil, floor
from pathlib import Path
# from tqdm import tqdm
from tqdm.notebook import tqdm

In [167]:
# Input directory; its entries are listed and used as class names.
data_dir = 'data/cifar'
# Root directory that receives the train/test/val folders.
output_dir = "data/output"
# Edge length in pixels of the square output images.
size = 32
# Seed that makes the shuffle (and therefore the split) reproducible.
seed = 230
# Fractions of each class assigned to train, test and val, in that order.
splits = [0.7, 0.15, 0.15]
# Cumulative fractions, used as cut points into the shuffled file list.
train_split = splits[0]
test_split = splits[0]+splits[1]

random.seed(seed)
# Compare with a tolerance: fractions that add up to 1 on paper can differ
# from 1.0 by a rounding error once summed as floats. Raise instead of
# asserting so the check also runs under `python -O`.
if abs(sum(splits) - 1.0) > 1e-9:
    raise ValueError('Sum of splits must be 1.')

In [168]:
def resize_and_save(filename, output_dir, size=size):
    """Resize the image in `filename` to a square and save it in `output_dir`.

    Args:
        filename: Path of the image file to read.
        output_dir: Directory the resized image is written to. It is not
            created here. The output keeps the base name of `filename`.
        size: Edge length in pixels of the square output image. The default
            is the value of the module-level `size` when this function is
            defined.

    Raises:
        ValueError: If OpenCV cannot read `filename` as an image.
        OSError: If the resized image could not be written.
    """
    # IMREAD_UNCHANGED loads the image as stored (including any alpha channel)
    image = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an obscure error inside cv2.resize
    if image is None:
        raise ValueError(f"Could not read image: {filename}")
    # Bilinear interpolation, stated explicitly (it is also cv2.resize's default)
    image = cv2.resize(image, (size, size), interpolation=cv2.INTER_LINEAR)
    # os.path.basename handles the platform's path separator, unlike split('/')
    path = os.path.join(output_dir, os.path.basename(filename))
    # cv2.imwrite returns False when nothing was written; do not drop it silently
    if not cv2.imwrite(path, image):
        raise OSError(f"Could not write image: {path}")

In [169]:
# Each sub-directory of data_dir is one class. Plain files (e.g. a stray
# .DS_Store) are skipped because they cannot be listed as a class folder
# later on; sorting makes the processing order deterministic.
class_names = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
)
num_classes = len(class_names)
# Number of entries one level below data_dir's sub-folders (progress bar total)
num_samples = len(list(Path(data_dir).glob('*/*')))

image_folders = [os.path.join(data_dir, class_name) for class_name in class_names]

# Output layout: <output_dir>/<train|test|val>/<class_name>
train_folders = [os.path.join(output_dir, 'train', class_name) for class_name in class_names]
test_folders = [os.path.join(output_dir, 'test', class_name) for class_name in class_names]
val_folders = [os.path.join(output_dir, 'val', class_name) for class_name in class_names]

# Create all folders (a plain loop, since only the side effect is wanted)
for folder in train_folders + test_folders + val_folders:
    Path(folder).mkdir(parents=True, exist_ok=True)

In [170]:
# The context manager closes the progress bar once all images are processed
with tqdm(total=num_samples, desc='Images', position=0) as tqdm_img:

    # Iterate over categories
    for directory in image_folders:

        # Re-seed for every category so that a category's shuffle does not
        # depend on how many categories were processed before it
        random.seed(seed)
        # Get filenames in category, sort and shuffle (for reproducible split)
        filenames = sorted(os.listdir(directory))
        random.shuffle(filenames)
        # Kept separate from `num_samples`, which holds the overall total
        num_class_samples = len(filenames)

        # Cut points into the shuffled list for the train/test/val split
        # NB: minimum 4 samples to get one in each split (for 0.7/0.15/0.15)
        train_end = floor(num_class_samples*train_split)
        test_end = floor(num_class_samples*test_split)

        # Split dataset into train test val
        ds = {'train': filenames[:train_end],
              'test': filenames[train_end:test_end],
              'val': filenames[test_end:]}

        # Class folder name; basename handles the platform's path separator
        class_name = os.path.basename(directory)

        # Resize every file into the folder of its split and class
        for split, split_filenames in ds.items():
            output = os.path.join(output_dir, split, class_name)
            for filename in split_filenames:
                resize_and_save(os.path.join(directory, filename), output, size=size)
                tqdm_img.update(1)

print("Done building dataset.")

HBox(children=(FloatProgress(value=0.0, description='Images', max=10000.0, style=ProgressStyle(description_wid…

Done building dataset.
