# In [None]:
import os
import shutil
import random
from pathlib import Path
from tqdm import tqdm

# --- CONFIGURATION ---

# Seed the module-level random generator so the shuffle is repeatable.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)

# -----------------------------------------------------------------------------
# Directory layout (all paths are relative)
# -----------------------------------------------------------------------------

# 1. SOURCE_DIR: the original raw data. It is only read, never written to.
#    This folder must exist inside the project's 'data' folder.
SOURCE_DIR = Path("data/MUESTRAS_ini")

# 2. DEST_DIR: where the experiment's copies of the files are placed.
DEST_DIR = Path("data/experimento_baselines")

# 3. Carriers to process; each name is a subfolder of SOURCE_DIR.
CARRIER_NAMES = [
    "Carrier_C1_675", "Carrier_C2_2825",
    "Carrier_C3_2975", "Carrier_C4_9435",
]

# 4. Names of the output folders created under each carrier in DEST_DIR.
TRAIN_FOLDER_NAME = "train_set_70"
TEST_FOLDER_NAME = "test_set_15"

# 5. Split size, expressed as a fixed number of samples per class.
#    70% of 50 images = 35 images go to training.
TRAIN_COUNT = 35

# 6. Subfolder names that are skipped when looking for class directories.
IGNORE_DIRS = ["SIN_CLASIFICAR", TRAIN_FOLDER_NAME, TEST_FOLDER_NAME, "validation_set_15"]

def _copy_files(files: list, dest_dir: Path) -> int:
    """
    Copy every path in ``files`` into ``dest_dir`` and return the failure count.

    A failed copy is reported and skipped so that one unreadable file does not
    abort the rest of the split.
    """
    failures = 0
    for file_path in files:
        try:
            shutil.copy(file_path, dest_dir)
        except OSError as e:
            # shutil.Error / SameFileError are OSError subclasses as well.
            print(f"[ERROR] Copy failed for {file_path.name}: {e}")
            failures += 1
    return failures


def split_files_for_carrier(src_carrier_path: Path, dest_carrier_path: Path):
    """
    Split one carrier's images into train/test sets and copy them.

    For each class subdirectory of ``src_carrier_path`` (names listed in
    ``IGNORE_DIRS`` are skipped) the image files are shuffled with the
    module-level ``random`` generator. The first ``TRAIN_COUNT`` files are
    copied to ``dest_carrier_path / TRAIN_FOLDER_NAME / <class>`` and the
    remaining ones to ``dest_carrier_path / TEST_FOLDER_NAME / <class>``.
    Source files are only read.

    Args:
        src_carrier_path: Carrier folder containing one subfolder per class.
        dest_carrier_path: Carrier folder to create/populate in the destination.

    Returns:
        None. Problems (missing source, empty class, failed copies) are
        reported on stdout and the function carries on or returns early.
    """
    print(f"\n--- Processing Carrier: {src_carrier_path.name} ---")

    # 1. Identify class directories in the source BEFORE touching the
    #    destination, so a missing carrier does not leave an empty tree behind.
    #    Directory listings come back in arbitrary order; sorting keeps the
    #    sequence of shuffle calls (and therefore the split) reproducible.
    try:
        class_dirs = sorted(
            d for d in src_carrier_path.iterdir()
            if d.is_dir() and d.name not in IGNORE_DIRS
        )
    except (FileNotFoundError, NotADirectoryError):
        print(f"[ERROR] Source directory {src_carrier_path} not found. Skipping.")
        return

    if not class_dirs:
        print(f"[WARN] No class directories found in {src_carrier_path.name}")
        return

    # 2. Create the destination directory structure
    train_dest_root = dest_carrier_path / TRAIN_FOLDER_NAME
    test_dest_root = dest_carrier_path / TEST_FOLDER_NAME
    train_dest_root.mkdir(parents=True, exist_ok=True)
    test_dest_root.mkdir(parents=True, exist_ok=True)

    print(f"   > Found {len(class_dirs)} classes. Starting split operation...")

    image_suffixes = {".png", ".jpg", ".jpeg"}
    failed_copies = 0

    # 3. Iterate over each class (e.g., ARM_ANCHO)
    for class_dir in tqdm(class_dirs, desc="   > Processing classes"):
        class_name = class_dir.name

        # 4. Create class subdirectories in the destination
        dest_train_class = train_dest_root / class_name
        dest_test_class = test_dest_root / class_name
        dest_train_class.mkdir(parents=True, exist_ok=True)
        dest_test_class.mkdir(parents=True, exist_ok=True)

        # 5. Collect all image files. The suffix test is case-insensitive so
        #    '.PNG' / '.JPG' are not dropped on case-sensitive filesystems, and
        #    the list is sorted so the seeded shuffle always starts from the
        #    same order.
        image_files = sorted(
            p for p in class_dir.iterdir()
            if p.is_file() and p.suffix.lower() in image_suffixes
        )

        if not image_files:
            print(f"[INFO] Class '{class_name}' is empty. Skipping.")
            continue

        # 6. Shuffle files (critical for a random split)
        random.shuffle(image_files)

        # 7. Split logic: first TRAIN_COUNT files train, the remainder test
        if len(image_files) <= TRAIN_COUNT:
            print(
                f"[WARN] Class '{class_name}' has only {len(image_files)} "
                f"image(s); its test set will be empty."
            )
        files_for_train = image_files[:TRAIN_COUNT]
        files_for_test = image_files[TRAIN_COUNT:]

        # 8./9. Copy to the training and test sets
        failed_copies += _copy_files(files_for_train, dest_train_class)
        failed_copies += _copy_files(files_for_test, dest_test_class)

    # Surface copy failures once per carrier so they are not lost in the log.
    if failed_copies:
        print(
            f"[WARN] {failed_copies} file(s) could not be copied for "
            f"{src_carrier_path.name}"
        )

def main():
    """
    Run the split for every carrier in CARRIER_NAMES.

    Prints a header, stops early if SOURCE_DIR does not exist, otherwise
    calls split_files_for_carrier once per carrier and prints a footer.
    """
    rule = "=" * 60

    # Header banner
    for line in (
        rule,
        "DATA SPLIT AND PREPARATION SCRIPT",
        f"Source: {SOURCE_DIR}",
        f"Destination: {DEST_DIR}",
        rule,
    ):
        print(line)

    # Nothing can be done without the raw data folder.
    if not SOURCE_DIR.exists():
        print(f"[CRITICAL] Source directory not found: {SOURCE_DIR}")
        print("Please ensure the data directory structure is correct.")
        return

    # Each carrier keeps its own subfolder name in both trees.
    for name in CARRIER_NAMES:
        split_files_for_carrier(SOURCE_DIR / name, DEST_DIR / name)

    # Footer banner
    print(f"\n{rule}")
    print("PROCESS COMPLETED SUCCESSFULLY")
    print(f"Output directory: {DEST_DIR}")
    print(rule)

# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()