In [None]:
from pathlib import Path
import shutil

# Setup Configuration

In [None]:
# Seed passed as random_state to the train/valid/test split so it can be repeated
SEED = 777

# Source directory scanned for images, and destination root the split dataset is
# moved into. Both are placeholders — replace them before running.
SRC_DATASET_DIR = Path('<src dataset path>')
DEST_DATASET_DIR = Path('<dest dataset path>')

# Dataset split settings: fraction of all images assigned to each subset.
# The training share is whatever remains after the test and validation shares.
TEST_RATIO = 0.1
VALID_RATIO = 0.2
TRAIN_RATIO = 1 - TEST_RATIO - VALID_RATIO

# Prepare Dataset
## Scan Input Dataset

In [None]:
# Collect every .jpg under the source directory (recursively). The result is
# sorted because glob yields entries in an arbitrary, filesystem-dependent
# order; without a stable order the seeded split would differ between machines.
dataset = sorted(SRC_DATASET_DIR.glob('**/*.jpg'))

n_total = len(dataset)
n_test = round(n_total * TEST_RATIO)
n_valid = round(n_total * VALID_RATIO)
# The training set is whatever is left after the hold-out sets are carved off.
# Computing it as a remainder (rather than rounding TRAIN_RATIO separately)
# guarantees the three counts add up to n_total and match the actual split.
n_train = n_total - n_test - n_valid

print(f'dataset: total({n_total}), train({n_train}), valid({n_valid}), test({n_test})')

## Split dataset into training set, validation set, and test set

In [None]:
from sklearn.model_selection import train_test_split

# Start with every sample in the training pool and no hold-out samples.
train_set = dataset
valid_set = []
test_set = []

# Carve the test set off first, then the validation set from what remains.
# A split whose requested size is zero is skipped entirely.
if n_test > 0:
    train_set, test_set = train_test_split(
        train_set,
        test_size=n_test,
        random_state=SEED,
    )

if n_valid > 0:
    train_set, valid_set = train_test_split(
        train_set,
        test_size=n_valid,
        random_state=SEED,
    )

# Report the resulting subset sizes.
subset_sizes = [len(subset) for subset in (train_set, valid_set, test_set)]
print('train_set({}), validation_set({}), test_set({})'.format(*subset_sizes))

In [None]:
def move_files(dataset, dest_dir):
    """Move image/label pairs into ``dest_dir/images`` and ``dest_dir/labels``.

    For every path in ``dataset`` the file ``<stem>.jpg`` located in the same
    directory is moved to ``dest_dir/images`` and the matching ``<stem>.txt``
    is moved to ``dest_dir/labels``. The label file is looked up in a sibling
    ``labels`` directory when the image's parent directory is named
    ``images``; otherwise it is expected right next to the image.

    Args:
        dataset: Iterable of ``pathlib.Path`` objects pointing at source images.
        dest_dir: ``pathlib.Path`` of the target split directory. The
            ``images`` and ``labels`` sub-directories are created if missing.

    Raises:
        FileNotFoundError: If an image or its label file does not exist. All
            pairs are checked before any file is moved, so a missing file
            cannot leave the source dataset partially migrated.
    """
    images_dir = dest_dir / 'images'
    labels_dir = dest_dir / 'labels'

    images_dir.mkdir(parents=True, exist_ok=True)
    labels_dir.mkdir(parents=True, exist_ok=True)

    # Phase 1: resolve every (source, destination) pair and verify the sources
    # exist. Failing here, before anything has moved, keeps images and labels
    # together instead of stranding an image whose label turned out missing.
    planned_moves = []
    for src_path in dataset:
        src_base = src_path.parent
        stem = src_path.stem

        image_fname = stem + '.jpg'
        label_fname = stem + '.txt'

        # Labels live in a sibling 'labels' directory for an images/labels
        # layout, or alongside the image for a flat layout.
        labels_base = src_base.parent / 'labels' if src_base.name == 'images' else src_base

        image_src = src_base / image_fname
        label_src = labels_base / label_fname
        for required in (image_src, label_src):
            if not required.exists():
                raise FileNotFoundError(
                    'missing file for sample {!r}: {}'.format(stem, required)
                )

        planned_moves.append((image_src, images_dir / image_fname))
        planned_moves.append((label_src, labels_dir / label_fname))

    # Phase 2: every source is known to exist, so perform the moves.
    for src, dst in planned_moves:
        shutil.move(src, dst)

# Relocate each subset into its own sub-directory of the destination dataset.
# Empty subsets are skipped, so no empty split folders are created for them.
for split_name, split_files in (
    ('train', train_set),
    ('valid', valid_set),
    ('test', test_set),
):
    if len(split_files) > 0:
        move_files(split_files, DEST_DATASET_DIR / split_name)