In [None]:
import os
from pathlib import Path
import shutil

# Define paths
dataset_root = Path("./dataset")

# The flat layout is always listed first, even when the folder is absent:
# the single-file test further down samples from labels_dirs[0].
labels_dirs = [
    dataset_root / "labels",
]

# Also check for train/valid/test structure (one level above the working dir)
train_labels = Path("../train/labels")
valid_labels = Path("../valid/labels")
test_labels = Path("../test/labels")

# Split directories are only added when they are actually present on disk.
labels_dirs.extend(
    split_dir
    for split_dir in (train_labels, valid_labels, test_labels)
    if split_dir.exists()
)

# Report only directories that exist, so the count matches the listing below.
existing_label_dirs = [label_dir for label_dir in labels_dirs if label_dir.exists()]
print(f"Found {len(existing_label_dirs)} label directories to process")
for d in existing_label_dirs:
    print(f"  - {d}")


In [None]:
def process_label_file(label_path, source_class=3, target_class=4):
    """
    Filter and relabel the annotations of a single YOLO-style label file.

    The file is only read here; nothing is written back.

    - Lines whose class ID equals ``source_class`` (default 3, Healthy-Skin)
      are kept, with the class ID rewritten to ``target_class`` (default 4).
    - Lines with any other integer class ID are dropped and counted as removed.
    - Blank lines, lines with fewer than 5 fields, and lines whose first
      field is not an integer are skipped without being counted.

    Args:
        label_path: Path of the label file to read.
        source_class: Class ID of the annotations to keep (default: 3).
        target_class: Class ID written in place of ``source_class`` (default: 4).

    Returns: (processed_lines, kept_count, removed_count)
        processed_lines holds the kept annotations, each newline-terminated
        and with fields separated by single spaces.
    """
    processed_lines = []
    kept_count = 0
    removed_count = 0
    replacement_id = str(target_class)

    with open(label_path, 'r') as f:
        for raw_line in f:
            # split() with no argument also discards surrounding whitespace,
            # so blank lines produce an empty list and fall through below.
            parts = raw_line.split()
            if len(parts) < 5:  # YOLO format: class x y w h
                continue

            try:
                class_id = int(parts[0])
            except ValueError:
                # A non-integer class field is treated like any other
                # malformed line instead of aborting the whole batch.
                continue

            if class_id == source_class:
                parts[0] = replacement_id
                processed_lines.append(' '.join(parts) + '\n')
                kept_count += 1
            else:
                removed_count += 1

    return processed_lines, kept_count, removed_count

# Test on one file first
# Take the first label file of the primary directory. The result is None when
# the directory is missing or holds no .txt files, so the preview is skipped
# instead of failing with an IndexError.
if labels_dirs[0].exists():
    test_file = next(iter(labels_dirs[0].glob("*.txt")), None)
else:
    test_file = None

if test_file is not None:
    print(f"Testing on: {test_file.name}")
    processed, kept, removed = process_label_file(test_file)
    print(f"  Kept {kept} Healthy-Skin annotations")
    print(f"  Removed {removed} other class annotations")
    print("  Processed content preview:")
    # Show at most the first three converted annotations.
    for line in processed[:3]:
        print(f"    {line.strip()}")
else:
    print(f"No label files found in {labels_dirs[0]} - skipping single-file test")

In [None]:
def process_all_labels(labels_dirs, dry_run=True):
    """
    Process all label files in the given directories.

    Every ``*.txt`` file directly inside each directory is passed to
    ``process_label_file``. Files that yield at least one kept annotation are
    overwritten with the processed lines (unless ``dry_run`` is True). Files
    that yield none are never written; they are collected and returned so the
    caller can decide what to do with them.

    Args:
        labels_dirs: List of label directory paths (``Path`` objects or strings)
        dry_run: If True, don't actually modify files (default: True)

    Returns:
        List of ``Path`` objects for the label files with no kept annotations,
        sorted by path within each directory.
    """
    total_files = 0
    total_kept = 0
    total_removed = 0
    total_untouched = 0
    files_with_healthy_skin = 0
    empty_files = []

    for labels_dir in labels_dirs:
        # Accept plain strings as well as Path objects.
        labels_dir = Path(labels_dir)
        if not labels_dir.exists():
            print(f"Skipping non-existent directory: {labels_dir}")
            continue

        print(f"\nProcessing directory: {labels_dir}")
        # Sorted so the summary listing is the same on every run.
        label_files = sorted(labels_dir.glob("*.txt"))
        print(f"Found {len(label_files)} label files")

        for label_file in label_files:
            total_files += 1
            processed_lines, kept, removed = process_label_file(label_file)

            if kept > 0:
                files_with_healthy_skin += 1
                total_kept += kept
                # Only annotations in rewritten files are really removed.
                total_removed += removed

                if not dry_run:
                    # Write the processed content back
                    with open(label_file, 'w') as f:
                        f.writelines(processed_lines)
            else:
                # File has no healthy skin annotations: it is left exactly as
                # it is, so its other-class annotations are NOT removed.
                empty_files.append(label_file)
                total_untouched += removed

    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total files processed: {total_files}")
    print(f"Files with Healthy-Skin: {files_with_healthy_skin}")
    print(f"Files without Healthy-Skin: {len(empty_files)}")
    print(f"Total Healthy-Skin annotations kept (changed to class 4): {total_kept}")
    print(f"Total other class annotations removed: {total_removed}")
    print(f"Other class annotations left in files without Healthy-Skin (files not modified): {total_untouched}")

    if empty_files:
        print("\nFiles with no Healthy-Skin annotations (first 10):")
        for f in empty_files[:10]:
            print(f"  - {f.name}")
        if len(empty_files) > 10:
            print(f"  ... and {len(empty_files) - 10} more")

    if dry_run:
        print("\n⚠️  DRY RUN MODE - No files were modified")
        print("Set dry_run=False to apply changes")
    else:
        print("\n✅ Files have been updated!")

    return empty_files

# Preview pass: with dry_run=True the summary is printed but no file is written.
empty_files = process_all_labels(labels_dirs=labels_dirs, dry_run=True)

In [None]:
# EXECUTE THIS CELL TO APPLY THE CHANGES (label files are overwritten in place)
# Review the dry-run summary first, then uncomment the line below to actually modify the files

# empty_files = process_all_labels(labels_dirs, dry_run=False)

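# Optional: Remove images and labels that have no healthy skin (this permanently deletes files)
# Uncomment below if you want to clean up files without healthy skin annotations
# NOTE: use an `empty_files` list computed before the labels were rewritten. Once class 3
# has been changed to class 4, a re-run finds no class 3 and reports every file as empty.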

# def remove_empty_files(empty_label_files):
#     """Remove label files and corresponding images that have no healthy skin"""
#     for label_file in empty_label_files:
#         # Remove label file
#         print(f"Removing: {label_file}")
#         label_file.unlink()
#         
#         # Find and remove corresponding image
#         image_dir = label_file.parent.parent / "images"
#         image_name = label_file.stem  # filename without .txt
#         
#         for ext in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
#             image_file = image_dir / f"{image_name}{ext}"
#             if image_file.exists():
#                 print(f"Removing: {image_file}")
#                 image_file.unlink()
#                 break
# 
# # Uncomment to remove files without healthy skin
# # remove_empty_files(empty_files)