In [None]:
import os
import random
import shutil
from itertools import islice

# ========== Configuration ==========
# Directory that is listed to discover the dataset files.
INPUT_FOLDER = "Dataset/all"
# Directory that receives the split folders and data.yaml; it is wiped
# and recreated on every run.
OUTPUT_FOLDER = "Dataset/SplitData"
# Fraction of the dataset per split. Only "train" and "val" are used to
# size the splits; "test" receives whatever remains.
SPLIT_RATIOS = {
    "train": 0.7,
    "val": 0.2,
    "test": 0.1,
}
# Class names written to data.yaml as `names` (and their count as `nc`).
# NOTE(review): order presumably matches the class ids in the label
# files — confirm against the annotations.
CLASSES = ["fake", "real"]

# ========== Cleanup Previous Output ==========
# Always start from an empty output tree so files from an earlier run
# cannot linger in the new splits.
if os.path.exists(OUTPUT_FOLDER):
    shutil.rmtree(OUTPUT_FOLDER)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# ========== Create Output Subdirectories ==========
# Every split gets an "images" and a "labels" directory.
for split in SPLIT_RATIOS:
    for subdir in ("images", "labels"):
        os.makedirs(f"{OUTPUT_FOLDER}/{split}/{subdir}", exist_ok=True)

# ========== Collect & Shuffle File Base Names ==========
# Each sample is identified by its file name without the extension, so an
# image and its label file collapse to a single entry.
all_files = os.listdir(INPUT_FOLDER)
# os.path.splitext strips only the final extension; splitting on the first
# '.' would truncate names such as "frame.001.jpg" to "frame".
# Sorting gives a stable starting order (set iteration order of strings can
# differ between interpreter runs), so a seeded shuffle is reproducible.
base_names = sorted({os.path.splitext(name)[0] for name in all_files})
random.shuffle(base_names)

# ========== Calculate Split Sizes ==========
total_files = len(base_names)
len_train = int(total_files * SPLIT_RATIOS["train"])
len_val = int(total_files * SPLIT_RATIOS["val"])
# The test split takes the remainder so the three sizes always sum to
# total_files despite the int() truncation above.
len_test = total_files - len_train - len_val

splits = [len_train, len_val, len_test]
split_names = ["train", "val", "test"]

# A single shared iterator is consumed by successive islice calls, so each
# split continues where the previous one stopped. Creating a new iterator
# per split would restart at the first element every time and make the
# val/test splits overlap with train.
names_iter = iter(base_names)
split_data = [list(islice(names_iter, count)) for count in splits]

print(f"📊 Total: {total_files} files")
print(f"📂 Split: {len_train} train, {len_val} val, {len_test} test")

# ========== Copy Files into Split Folders ==========
# For every split, copy each sample's image (.jpg) and label (.txt) from
# INPUT_FOLDER into <OUTPUT_FOLDER>/<split>/images and .../labels.
for split_name, members in zip(split_names, split_data):
    for file_base in members:
        img_src = os.path.join(INPUT_FOLDER, f"{file_base}.jpg")
        label_src = os.path.join(INPUT_FOLDER, f"{file_base}.txt")
        img_dst = os.path.join(OUTPUT_FOLDER, split_name, "images", f"{file_base}.jpg")
        label_dst = os.path.join(OUTPUT_FOLDER, split_name, "labels", f"{file_base}.txt")

        # Check that the pair is complete *before* copying anything: copying
        # the image first and failing on the label would leave an orphaned
        # image in the split even though the pair is reported as skipped.
        if not (os.path.isfile(img_src) and os.path.isfile(label_src)):
            print(f"⚠️ Skipped missing pair: {file_base}")
            continue

        shutil.copy(img_src, img_dst)
        shutil.copy(label_src, label_dst)

print("✅ Dataset splitting complete.")

# ========== Create data.yaml for YOLO ==========
# Assemble the file line by line: split image directories first, then a
# blank separator, then the class count and the class name list.
yaml_lines = [
    "path: .",
    "train: train/images",
    "val: val/images",
    "test: test/images",
    "",
    f"nc: {len(CLASSES)}",
    f"names: {CLASSES}",
]
# The file ends with a single trailing newline.
data_yaml_content = "\n".join(yaml_lines) + "\n"

yaml_path = os.path.join(OUTPUT_FOLDER, "data.yaml")
with open(yaml_path, 'w') as yaml_file:
    yaml_file.write(data_yaml_content)

print("✅ data.yaml file created.")
