In [1]:
import os
import shutil
import random

In [2]:


def split_dataset(source_dir, dest_dir, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=None):
    """Copy the 'fake' and 'real' image folders into train/val/test folders.

    For each category found under ``source_dir`` the files are listed,
    shuffled and copied to ``dest_dir/<split>/<category>/``. The train and
    validation sizes are ``int(total * ratio)``; the test split receives every
    remaining file so that no file is dropped by rounding.

    Args:
        source_dir: Directory containing the ``fake`` and ``real`` sub-folders.
        dest_dir: Directory under which ``train``, ``val`` and ``test`` folders
            are created (existing folders are reused, not cleared).
        train_ratio: Fraction of each category copied to ``train``.
        val_ratio: Fraction of each category copied to ``val``.
        test_ratio: Fraction reserved for ``test``; the three ratios must sum
            to 1.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so callers that seed
            ``random`` themselves keep their behaviour.

    Returns:
        None. Progress and per-category counts are printed.

    Raises:
        ValueError: If any ratio is negative or the ratios do not sum to 1.

    Note:
        Files are copied on top of whatever ``dest_dir`` already holds, so
        re-running with a different shuffle into the same destination can
        leave one file present in more than one split.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError(f"Split ratios must be non-negative, got {ratios}.")
    # Tolerance absorbs float rounding such as 0.8 + 0.1 + 0.1 != 1.0 exactly.
    if abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(f"Split ratios must sum to 1, got {ratios} (sum={sum(ratios)}).")

    if not os.path.exists(source_dir):
        print(f"Source directory '{source_dir}' does not exist.")
        return

    categories = ['fake', 'real']

    # A private generator keeps a seeded run from disturbing global RNG state;
    # without a seed, fall back to the shared module-level generator.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    for category in categories:
        category_path = os.path.join(source_dir, category)
        if not os.path.exists(category_path):
            print(f"Category folder '{category}' does not exist in source directory.")
            continue

        # Keep regular files only (shutil.copy cannot copy a directory) and
        # sort them, because os.listdir order is arbitrary and would make a
        # seeded shuffle depend on the filesystem.
        images = sorted(
            name for name in os.listdir(category_path)
            if os.path.isfile(os.path.join(category_path, name))
        )
        shuffle(images)

        total_images = len(images)
        train_count = int(total_images * train_ratio)
        val_count = int(total_images * val_ratio)
        # Remainder goes to test so rounding never discards a file.
        test_count = total_images - train_count - val_count

        print(f"{category}: Total={total_images}, Train={train_count}, Validation={val_count}, Test={test_count}")

        splits = {
            'train': images[:train_count],
            'val': images[train_count:train_count + val_count],
            'test': images[train_count + val_count:],
        }

        for split_name, split_images in splits.items():
            split_dir = os.path.join(dest_dir, split_name, category)
            os.makedirs(split_dir, exist_ok=True)

            for image in split_images:
                src_path = os.path.join(category_path, image)
                dst_path = os.path.join(split_dir, image)
                shutil.copy(src_path, dst_path)

    print("Dataset splitting complete!")

In [3]:
# Example usage
# split_dataset shuffles with the module-level `random` generator, so seed it
# here: a Restart & Run All then repeats the same split instead of drawing a
# new one. Files are copied into existing split folders without clearing them,
# so an unseeded re-run could leave one image in several splits.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

source_directory = "/mnt/documents/Minor-Project/Datasets/dfdc_frame_150"  # Replace with the path to your dataset
destination_directory = "/mnt/documents/Minor-Project/Datasets/dfdc_splitted"  # Replace with the path to the destination folder

split_dataset(source_directory, destination_directory)


fake: Total=77944, Train=62355, Validation=7794, Test=7795
real: Total=16203, Train=12962, Validation=1620, Test=1621
Dataset splitting complete!
