In [None]:
import os
from pathlib import Path
import shutil

# Define paths - checking for train/valid/test structure
# Based on your structure: dataset/healthyskin/train, dataset/healthyskin/valid, dataset/healthyskin/test
labels_dirs = []

# Check for train/valid/test splits (same directory level as notebook)
train_labels = Path("./train/labels")
valid_labels = Path("./valid/labels")
test_labels = Path("./test/labels")

if train_labels.exists():
    labels_dirs.append(train_labels)
if valid_labels.exists():
    labels_dirs.append(valid_labels)
if test_labels.exists():
    labels_dirs.append(test_labels)

# Also check old dataset structure (fallback)
dataset_labels = Path("./dataset/labels")
if dataset_labels.exists():
    labels_dirs.append(dataset_labels)

if not labels_dirs:
    print("❌ No label directories found!")
    print("Expected structure:")
    print("  ./train/labels")
    print("  ./valid/labels")
    print("  ./test/labels")
else:
    print(f"✅ Found {len(labels_dirs)} label directories to process:")
    for d in labels_dirs:
        file_count = len(list(d.glob("*.txt")))
        print(f"  - {d} ({file_count} files)")

Found 1 label directories to process
  - dataset/labels


In [5]:
def process_label_file(label_path):
    """
    Process a single label file:
    - Keep only lines with class 3 (Healthy-Skin)
    - Change class 3 to class 4
    - Remove all other classes
    
    Returns: (processed_lines, kept_count, removed_count)
    """
    processed_lines = []
    kept_count = 0
    removed_count = 0
    
    with open(label_path, 'r') as f:
        lines = f.readlines()
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Split the line to get the class ID (first element)
        parts = line.split()
        if len(parts) < 5:  # YOLO format: class x y w h
            continue
            
        class_id = int(parts[0])
        
        # Keep only Healthy-Skin (class 3)
        if class_id == 3:
            # Change class 3 to class 4
            parts[0] = '4'
            processed_lines.append(' '.join(parts) + '\n')
            kept_count += 1
        else:
            removed_count += 1
    
    return processed_lines, kept_count, removed_count

# Test on one file first
test_file = list(labels_dirs[0].glob("*.txt"))[0] if labels_dirs[0].exists() else None
if test_file:
    print(f"Testing on: {test_file.name}")
    processed, kept, removed = process_label_file(test_file)
    print(f"  Kept {kept} Healthy-Skin annotations")
    print(f"  Removed {removed} other class annotations")
    print(f"  Processed content preview:")
    for line in processed[:3]:
        print(f"    {line.strip()}")

Testing on: erythema-migrans241_png_jpg.rf.94ea5b105f58fd5ef2610a1aedb737f1.txt
  Kept 0 Healthy-Skin annotations
  Removed 2 other class annotations
  Processed content preview:


In [6]:
def process_all_labels(labels_dirs, dry_run=True):
    """
    Process all label files in the given directories.
    
    Args:
        labels_dirs: List of label directory paths
        dry_run: If True, don't actually modify files (default: True)
    """
    total_files = 0
    total_kept = 0
    total_removed = 0
    files_with_healthy_skin = 0
    empty_files = []
    
    for labels_dir in labels_dirs:
        if not labels_dir.exists():
            print(f"Skipping non-existent directory: {labels_dir}")
            continue
            
        print(f"\nProcessing directory: {labels_dir}")
        label_files = list(labels_dir.glob("*.txt"))
        print(f"Found {len(label_files)} label files")
        
        for label_file in label_files:
            total_files += 1
            processed_lines, kept, removed = process_label_file(label_file)
            
            total_kept += kept
            total_removed += removed
            
            if kept > 0:
                files_with_healthy_skin += 1
                
                if not dry_run:
                    # Write the processed content back
                    with open(label_file, 'w') as f:
                        f.writelines(processed_lines)
            else:
                # File has no healthy skin annotations
                empty_files.append(label_file)
    
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total files processed: {total_files}")
    print(f"Files with Healthy-Skin: {files_with_healthy_skin}")
    print(f"Files without Healthy-Skin: {len(empty_files)}")
    print(f"Total Healthy-Skin annotations kept (changed to class 4): {total_kept}")
    print(f"Total other class annotations removed: {total_removed}")
    
    if empty_files:
        print(f"\nFiles with no Healthy-Skin annotations (first 10):")
        for f in empty_files[:10]:
            print(f"  - {f.name}")
        if len(empty_files) > 10:
            print(f"  ... and {len(empty_files) - 10} more")
    
    if dry_run:
        print("\n⚠️  DRY RUN MODE - No files were modified")
        print("Set dry_run=False to apply changes")
    else:
        print("\n✅ Files have been updated!")
    
    return empty_files

# Run in dry-run mode first
empty_files = process_all_labels(labels_dirs, dry_run=True)


Processing directory: dataset/labels
Found 2787 label files

SUMMARY
Total files processed: 2787
Files with Healthy-Skin: 321
Files without Healthy-Skin: 2466
Total Healthy-Skin annotations kept (changed to class 4): 1053
Total other class annotations removed: 7666

Files with no Healthy-Skin annotations (first 10):
  - erythema-migrans241_png_jpg.rf.94ea5b105f58fd5ef2610a1aedb737f1.txt
  - t-eczema-nummular-150_jpg.rf.fe46636f91eec38ffe6f4bb2c172ca4e.txt
  - sjs-ten-ocular-27__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd_jpg.rf.86b054f2bf30b15cc2eb724a28e6ac7c.txt
  - shutterstock_484652500-scaled-Copy_jpg.rf.fb16379e1d60b6f3a4cfe0782fb6977a.txt
  - 05keratosisPilaris010120_jpeg_jpg.rf.f2caf77b5cb9aa8139f7418bb4b40e53.txt
  - viral-wart-22__WatermarkedWyJXYXRlcm1hcmtlZCJd-1-Copy_jpg.rf.811fd079b974c99f5e7541a30ab7056c.txt
  - KP-9-_jpg.rf.97c005e335903eb4e0f300ba5d78e854.txt
  - t-warts-32_jpg.rf.ef33dbdbc399394e36710036b855a6c7.txt
  - erythema-migrans264_jpg.rf.de3856292279

In [7]:
# EXECUTE THIS CELL TO APPLY THE CHANGES
# Uncomment the line below to actually modify the files

empty_files = process_all_labels(labels_dirs, dry_run=False)


def remove_empty_files(empty_label_files):
    """Remove label files and corresponding images that have no healthy skin"""
    for label_file in empty_label_files:
        # Remove label file
        print(f"Removing: {label_file}")
        label_file.unlink()
        
        # Find and remove corresponding image
        image_dir = label_file.parent.parent / "images"
        image_name = label_file.stem  # filename without .txt
        
        for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
            image_file = image_dir / f"{image_name}{ext}"
            if image_file.exists():
                print(f"Removing: {image_file}")
                image_file.unlink()
                break

# Uncomment to remove files without healthy skin
remove_empty_files(empty_files)


Processing directory: dataset/labels
Found 2787 label files

SUMMARY
Total files processed: 2787
Files with Healthy-Skin: 321
Files without Healthy-Skin: 2466
Total Healthy-Skin annotations kept (changed to class 4): 1053
Total other class annotations removed: 7666

Files with no Healthy-Skin annotations (first 10):
  - erythema-migrans241_png_jpg.rf.94ea5b105f58fd5ef2610a1aedb737f1.txt
  - t-eczema-nummular-150_jpg.rf.fe46636f91eec38ffe6f4bb2c172ca4e.txt
  - sjs-ten-ocular-27__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd_jpg.rf.86b054f2bf30b15cc2eb724a28e6ac7c.txt
  - shutterstock_484652500-scaled-Copy_jpg.rf.fb16379e1d60b6f3a4cfe0782fb6977a.txt
  - 05keratosisPilaris010120_jpeg_jpg.rf.f2caf77b5cb9aa8139f7418bb4b40e53.txt
  - viral-wart-22__WatermarkedWyJXYXRlcm1hcmtlZCJd-1-Copy_jpg.rf.811fd079b974c99f5e7541a30ab7056c.txt
  - KP-9-_jpg.rf.97c005e335903eb4e0f300ba5d78e854.txt
  - t-warts-32_jpg.rf.ef33dbdbc399394e36710036b855a6c7.txt
  - erythema-migrans264_jpg.rf.de3856292279

In [None]:
import os
from pathlib import Path

# Count files in all splits
splits = ['train', 'test', 'valid']
total_labels = 0
total_images = 0

print("File counts by split:")
print("=" * 50)

for split in splits:
    labels_path = Path(f'./{split}/labels')
    images_path = Path(f'./{split}/images')
    
    if labels_path.exists() and images_path.exists():
        label_count = len([f for f in os.listdir(labels_path) if os.path.isfile(os.path.join(labels_path, f))])
        image_count = len([f for f in os.listdir(images_path) if os.path.isfile(os.path.join(images_path, f))])
        
        print(f"{split.upper()}:")
        print(f"  Labels: {label_count}")
        print(f"  Images: {image_count}")
        print(f"  Match: {'✅' if label_count == image_count else '❌'}")
        
        total_labels += label_count
        total_images += image_count
    else:
        print(f"{split.upper()}: Not found")

print("=" * 50)
print(f"TOTAL:")
print(f"  Labels: {total_labels}")
print(f"  Images: {total_images}")
print(f"  Match: {'✅' if total_labels == total_images else '❌'}")

Labels count: 321
Images count: 321


In [None]:
import os
from pathlib import Path

# Process all splits: train, test, valid
splits = ['train', 'test', 'valid']
prefix = "00000"  # Change this prefix as needed

for split in splits:
    labels_path = Path(f'./{split}/labels')
    images_path = Path(f'./{split}/images')
    
    for folder in [labels_path, images_path]:
        if folder.exists():
            print(f"Renaming files in: {folder}")
            
            files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
            for filename in files:
                old_path = os.path.join(folder, filename)
                new_filename = prefix + filename
                new_path = os.path.join(folder, new_filename)
                
                os.rename(old_path, new_path)
            
            print(f"  ✅ Renamed {len(files)} files")
        else:
            print(f"  ❌ Path not found: {folder}")

print("\nDone renaming all files!")

Renaming files in: /home/ksan/Documents/my-projects/thesis/segmentation-fr/segmentation-thermal-burns/dataset/healthyskin/dataset/labels
Done.
Renaming files in: /home/ksan/Documents/my-projects/thesis/segmentation-fr/segmentation-thermal-burns/dataset/healthyskin/dataset/images
Done.
