In [1]:
import os
import shutil
import random
from pathlib import Path

In [7]:
def split_dataset(input_dir, output_dir, val_ratio=0.1, seed=42, move=True):
    """
    Split the files in `input_dir` into `train` and `val` folders inside `output_dir`.

    Only regular files directly inside `input_dir` are considered; subdirectories
    are ignored. The file list is sorted before shuffling so that the same seed
    yields the same split no matter in which order the directory is listed.

    Args:
        input_dir (str or Path): Path to the folder containing files to split.
        output_dir (str or Path): Destination root folder; the `train` and `val`
            subfolders are created inside it if they do not exist yet.
        val_ratio (float): Fraction of files to use for validation, within
            [0, 1] (default 0.1). The validation count is rounded down.
        seed (int): Seed for the shuffle. A private generator is used, so the
            global `random` module state is left untouched.
        move (bool): If True (default), files are moved out of `input_dir`, so
            running the split a second time on the same folder finds nothing
            left to split. If False, files are copied and `input_dir` is kept
            intact.

    Raises:
        ValueError: If `val_ratio` is not within [0, 1].
        FileNotFoundError: If `input_dir` does not exist.
        NotADirectoryError: If `input_dir` exists but is not a directory.
    """
    # A ratio outside [0, 1] would produce a negative or oversized slice index
    # and silently yield a nonsensical split, so reject it up front.
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be within [0, 1], got {val_ratio!r}")

    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    # Check the input before touching the filesystem so that a mistyped path
    # does not leave empty train/val folders behind.
    if not input_dir.exists():
        raise FileNotFoundError(f"Input directory does not exist: {input_dir}")
    if not input_dir.is_dir():
        raise NotADirectoryError(f"Input path is not a directory: {input_dir}")

    train_dir = output_dir / "train"
    val_dir = output_dir / "val"

    # Create output directories
    train_dir.mkdir(parents=True, exist_ok=True)
    val_dir.mkdir(parents=True, exist_ok=True)

    # List files; iterdir() yields entries in arbitrary order, so sort first to
    # make the seeded shuffle reproducible.
    files = sorted(f for f in input_dir.iterdir() if f.is_file())
    random.Random(seed).shuffle(files)

    val_count = int(len(files) * val_ratio)
    val_files = files[:val_count]
    train_files = files[val_count:]

    # Move (default) or copy the files into their split folder
    transfer = shutil.move if move else shutil.copy2
    for f in train_files:
        transfer(f, train_dir / f.name)
    for f in val_files:
        transfer(f, val_dir / f.name)

    print(f"Split complete: {len(train_files)} train / {len(val_files)} val")

In [8]:
# Split galData: 10% of the files go to val, the rest to train.
split_dataset("../datasets/anime/galData", "../datasets/anime/galData_val", val_ratio=0.1)

Split complete: 4399 train / 488 val


In [9]:
# Split frameData: 10% of the files go to val, the rest to train.
split_dataset("../datasets/anime/frameData", "../datasets/anime/frameData_val", val_ratio=0.1)

Split complete: 503 train / 55 val


In [10]:
# Split illustrateData: 10% of the files go to val, the rest to train.
split_dataset("../datasets/anime/illustrateData", "../datasets/anime/illustrateData_val", val_ratio=0.1)

Split complete: 694 train / 77 val
