In [13]:
import os, re, json, random, shutil, yaml

In [14]:
def get_base_names(file_list):
    # Extract base names using regex
    base_names = set()
    for filename in file_list:
        # Look for anything before _pre_ or _post_
        match = re.match(r'(.+?)_(?:pre|post)_', filename)
        if match:
            base_names.add(match.group(1))
    return list(base_names)

def split_and_move_files(source_dir, train_dir, val_dir, file_ext, base_names_train):
    # Get all files with the specified extension
    all_files = [f for f in os.listdir(source_dir) if f.endswith(file_ext)]
    
    # Move files based on their base names
    for filename in all_files:
        match = re.match(r'(.+?)_(?:pre|post)_', filename)
        if match:
            base_name = match.group(1)
            source_path = os.path.join(source_dir, filename)
            
            if base_name in base_names_train:
                dest_path = os.path.join(train_dir, filename)
            else:
                dest_path = os.path.join(val_dir, filename)
                
            shutil.copy2(source_path, dest_path)

In [15]:
jpg_source_dir = "sample_input/images"
txt_source_dir = "sample_input/labels_yolo"
jpg_train_dir = "sample_output/train/images"
jpg_val_dir = "sample_output/valid/images"
txt_train_dir = "sample_output/train/labels_yolo"
txt_val_dir = "sample_output/valid/labels_yolo"

# Create output directories if they don't exist
for dir_path in [jpg_train_dir, jpg_val_dir, txt_train_dir, txt_val_dir]:
    os.makedirs(dir_path, exist_ok=True)

In [16]:
# Get all jpg files and extract base names
jpg_files = [f for f in os.listdir(jpg_source_dir) if f.endswith('.jpg')]
base_names = get_base_names(jpg_files)

# Split base names into train/val (80/20)
TRAIN_RATIO = 0.8
random.seed(42)
random.shuffle(base_names)
split_idx = int(len(base_names) * TRAIN_RATIO)
base_names_train = set(base_names[:split_idx])
base_names_val = set(base_names[split_idx:])

# Move jpg files
split_and_move_files(jpg_source_dir, jpg_train_dir, jpg_val_dir, '.jpg', base_names_train)

# Move txt files
split_and_move_files(txt_source_dir, txt_train_dir, txt_val_dir, '.txt', base_names_train)

# Print summary
print(f"Total base names: {len(base_names)}")
print(f"Training set: {len(base_names_train)}")
print(f"Validation set: {len(base_names_val)}")

Total base names: 13
Training set: 10
Validation set: 3


In [17]:
train_path = os.path.abspath(jpg_train_dir)
val_path = os.path.abspath(jpg_val_dir)
test_path = "sample_input/cropped_jpg"

# Create yaml content
yaml_content = {
    'train': train_path,
    'val': val_path,
    'test': test_path,
    'nc': 1,
    'names': ['bld']
}

# Write yaml file
with open('data.yaml', 'w') as f:
    yaml.safe_dump(yaml_content, f, default_flow_style=False)

# Print the contents to verify
print("\nGenerated YAML content:")
print(yaml.safe_dump(yaml_content, default_flow_style=False))


Generated YAML content:
names:
- bld
nc: 1
test: sample_input/cropped_jpg
train: C:\scripts\image_preprocessing\05_prepare_dataset_split_and_config\sample_output\train\images
val: C:\scripts\image_preprocessing\05_prepare_dataset_split_and_config\sample_output\valid\images

