# test 데이터셋 분리하기

validation 데이터셋을 val : test = 3 : 2 비율로 분리하려 합니다. 즉, validation의 약 40%(ratio = 0.4)를 test로 옮깁니다.

라벨은 분리하지 않아도 됩니다.


In [1]:
import os
import random
import re

from pathlib import Path
from pprint import pprint
from typing import (
    Optional, Sequence, Union
)


# Seed from OS entropy (or the current time when none is available), so each
# run of the notebook selects a different random subset.
random.seed()


In [4]:
def split_testdata(
    source: Path, destination: Path, ratio: float, dryrun: bool = False
) -> Sequence[Path]:
    '''Move a random share of image folders from ``source`` to ``destination``.

    Walks ``source`` and, for every directory whose first subdirectory name
    starts with two digits, randomly picks ``round(ratio * n)`` of its ``n``
    subdirectories and moves each one to the same relative location under
    ``destination``.

    Args:
        source: Root directory to take folders from.
        destination: Root directory that receives the selected folders.
        ratio: Fraction of the subdirectories to move (e.g. ``0.4``).
        dryrun: When true, only print the planned moves; nothing is touched.

    Returns:
        The folders that could not be moved (empty when everything succeeded
        or when ``dryrun`` is true).
    '''
    p_imgdir = re.compile(r'^[0-9]{2}')

    fail: list = []
    for stem, branches, _leaves in os.walk(source):
        if not branches:
            continue
        # Only directories whose first child looks like an image folder
        # (name starts with two digits) are split.
        if not p_imgdir.search(branches[0]):
            continue
        # Decide how many folders to move.
        k = round(ratio * len(branches))
        tomove = random.sample(
            [Path(stem) / branch for branch in branches], k=k
        )

        for dirpath in tomove:
            dst = destination / dirpath.relative_to(source)
            if dryrun:
                print('from:', dirpath)
                print('to:', dst.parent)
                print('    ', dst, end='\n\n')
                continue
            # Guard each move separately so that one failure neither aborts
            # the remaining moves nor goes unreported.
            try:
                dst.parent.mkdir(parents=True, exist_ok=True)
                dirpath.rename(dst)
            except OSError as why:
                print(f'FAIL: "{str(dirpath)}" ({why})')
                fail.append(dirpath)

    return fail

## dry run


In [5]:
# Preview only: with dryrun=True the planned moves are printed and no
# directory is created or renamed. ratio=0.4 matches val : test = 3 : 2.
split_testdata(
    Path('./dataset/validation'), Path('./dataset/test'), 0.4, dryrun=True
)

from: dataset/validation/의류/외투/18_X001_C039_1103
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X001_C039_1103

from: dataset/validation/의류/외투/18_X003_C318_1116_6
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X003_C318_1116_6

from: dataset/validation/의류/외투/18_X003_C172_1125
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X003_C172_1125

from: dataset/validation/의류/외투/18_X390_C900_1209
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X390_C900_1209

from: dataset/validation/의류/외투/18_X022_C203_1123
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X022_C203_1123

from: dataset/validation/의류/외투/18_X010_C162_1211
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X010_C162_1211

from: dataset/validation/의류/외투/18_X005_C015_1116
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X005_C015_1116

from: dataset/validation/의류/외투/18_X023_C044_1105_7
to: dataset/test/의류/외투
     dataset/test/의류/외투/18_X023_C044_1105_7

from: dataset/validation/의류/외투/18_X013_C166_1125
to: dataset/test/의류/외투


## wet run


In [None]:
# Real run: same arguments as the dry run but without dryrun, so the selected
# folders are actually moved. The return value is kept in `fail`.
fail = split_testdata(
    Path('./dataset/validation'), Path('./dataset/test'), 0.4
)