In [4]:
import os
import shutil
from sklearn.model_selection import KFold, train_test_split

# Source tree: one sub-folder per class, each holding .png images.
# Destination: a 'test' folder plus 'set_1' .. 'set_5', each with 'train'/'val'.
# NOTE(review): the raw-string prefix combined with doubled backslashes leaves
# literal double separators in these paths. Windows normally tolerates that,
# so the literals are kept unchanged -- confirm before normalising them.
base_path = r'D:\\project_geo\\code_test_combined\\organized32_32'
output_dir = r'D:\\project_geo\\code_test_combined\\5folds_with_test_updated_combined_ver3'


def _copy_into_label_dirs(paths, split_dir):
    """Copy every file in *paths* into ``split_dir/<label>/``.

    The label is the name of the folder that directly contains the source
    file. Label folders are created on demand. A file already present under
    the same name in the destination is overwritten by ``shutil.copy``.
    """
    for src_path in paths:
        label = os.path.basename(os.path.dirname(src_path))
        dest_dir = os.path.join(split_dir, label)
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(src_path, dest_dir)


os.makedirs(output_dir, exist_ok=True)

file_paths = []

# Iterate over class folders and collect file paths.
# The extension check is case-insensitive so '.PNG' files are not dropped.
for root, dirs, files in os.walk(base_path):
    for file_name in files:
        if file_name.lower().endswith('.png'):
            file_paths.append(os.path.join(root, file_name))

# os.walk yields entries in an arbitrary, filesystem-dependent order. The
# seeded splits below are only reproducible if their input order is fixed.
file_paths.sort()

if not file_paths:
    # Fail with a clear message instead of the less direct error raised by
    # train_test_split on an empty list (same exception type).
    raise ValueError(f"No .png files found under {base_path!r}; nothing to split.")

# Extract 10% of the data for testing
train_val_paths, test_paths = train_test_split(file_paths, test_size=0.1, random_state=42)

# Create the test set directory
test_dir = os.path.join(output_dir, 'test')
os.makedirs(test_dir, exist_ok=True)
_copy_into_label_dirs(test_paths, test_dir)

# Create the 5-fold sets from the remaining data
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# NOTE: directories are created with exist_ok=True and never emptied, so
# re-running into a populated output_dir mixes old and new files. Use a fresh
# output_dir whenever the input data or the split changes.
for set_number, (train_idx, val_idx) in enumerate(kf.split(train_val_paths), 1):
    set_dir = os.path.join(output_dir, f'set_{set_number}')
    train_dir = os.path.join(set_dir, 'train')
    val_dir = os.path.join(set_dir, 'val')

    # Create directories for train and val within the set
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # The held-out fold becomes validation data; the other folds are training data.
    _copy_into_label_dirs((train_val_paths[idx] for idx in val_idx), val_dir)
    _copy_into_label_dirs((train_val_paths[idx] for idx in train_idx), train_dir)

    print(f"Set {set_number} created with validation set from fold {set_number} and training set from other folds.")

print("5-fold created with a separate test set.")


Set 1 created with validation set from fold 1 and training set from other folds.
Set 2 created with validation set from fold 2 and training set from other folds.
Set 3 created with validation set from fold 3 and training set from other folds.
Set 4 created with validation set from fold 4 and training set from other folds.
Set 5 created with validation set from fold 5 and training set from other folds.
5-fold created with a separate test set.
