# Data folder creation from downloaded dataset

In [None]:
import os
import shutil
import random

In [4]:
def count_files_in_subfolders(folder_path):
    """Count the files found directly inside each immediate subfolder.

    Args:
        folder_path: Directory whose immediate subdirectories are inspected.

    Returns:
        A dict mapping each subdirectory name to the number of entries in
        that subdirectory for which ``is_file()`` is true. Files lying
        directly in ``folder_path`` are ignored, and the scan does not
        descend below the first level of subfolders.
    """
    def _n_files(directory):
        # Only direct children are examined; nested folders are not walked.
        return len([child for child in os.scandir(directory) if child.is_file()])

    return {
        sub.name: _n_files(sub.path)
        for sub in os.scandir(folder_path)
        if sub.is_dir()
    }

# Report how many files each class folder of the raw dataset contains.
folder_path = "../dataset"  # point this at the downloaded dataset root
counts = count_files_in_subfolders(folder_path)

for subfolder in counts:
    count = counts[subfolder]
    print(f"{subfolder}: {count} files")

dew: 698 files
fogsmog: 851 files
frost: 475 files
glaze: 639 files
hail: 591 files
lightning: 377 files
rain: 526 files
rainbow: 232 files
rime: 1160 files
sandstorm: 692 files
snow: 621 files


In [8]:
def split_dataset(source_folder, dest_folder, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, *, seed=None):
    """Copy a class-per-subfolder dataset into train/val/test folders.

    Every immediate subdirectory of ``source_folder`` is treated as one
    class. Its files are shuffled and copied (the source is left untouched)
    to ``dest_folder/<split>/<class_name>/``.

    Args:
        source_folder: Directory containing one subdirectory per class.
        dest_folder: Directory that receives the ``train``, ``val`` and
            ``test`` subfolders; created if missing.
        train_ratio: Fraction of each class copied to ``train``. The count
            is truncated to an integer.
        val_ratio: Fraction of each class copied to ``val``. The count is
            truncated to an integer.
        test_ratio: Expected fraction for ``test``. The test split always
            receives every file left after train and val, so this value
            is only checked for consistency; a warning is issued when the
            three ratios do not sum to 1.
        seed: Optional seed for a private random generator, making the
            split reproducible. When ``None`` the module-level ``random``
            state is used, as before.

    Raises:
        ValueError: If any ratio is negative or ``train_ratio + val_ratio``
            exceeds 1.

    Note:
        Existing files in ``dest_folder`` are never removed. Re-running
        with a different shuffle into a populated destination can leave
        the same file in more than one split.
    """
    import warnings

    tolerance = 1e-9  # absorbs float rounding such as 0.7 + 0.2 + 0.1
    if min(train_ratio, val_ratio, test_ratio) < 0:
        raise ValueError("split ratios must be non-negative")
    if train_ratio + val_ratio > 1 + tolerance:
        raise ValueError("train_ratio + val_ratio must not exceed 1")
    if abs(train_ratio + val_ratio + test_ratio - 1) > tolerance:
        warnings.warn(
            "ratios do not sum to 1; the test split receives every file "
            "left over after train and val, regardless of test_ratio",
            stacklevel=2,
        )

    # A private generator keeps a seeded split independent of global state.
    rng = random if seed is None else random.Random(seed)

    # Create destination folders
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(dest_folder, split), exist_ok=True)

    # Sorted so that the order in which the generator is consumed does not
    # depend on the directory listing order.
    for class_name in sorted(os.listdir(source_folder)):
        class_path = os.path.join(source_folder, class_name)
        if not os.path.isdir(class_path):
            continue  # skip files

        # Sort before shuffling: the shuffle result then depends only on
        # the generator state, not on the listing order.
        files = sorted(
            f for f in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, f))
        )
        rng.shuffle(files)

        n_total = len(files)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)

        splits = {
            'train': files[:n_train],
            'val': files[n_train:n_train + n_val],
            'test': files[n_train + n_val:],  # remainder, absorbs truncation
        }

        # Copy files to the new structure
        for split_name, split_files in splits.items():
            split_class_folder = os.path.join(dest_folder, split_name, class_name)
            os.makedirs(split_class_folder, exist_ok=True)
            for f in split_files:
                src_file = os.path.join(class_path, f)
                dst_file = os.path.join(split_class_folder, f)
                shutil.copy2(src_file, dst_file)

    print("Dataset split completed!")

# Build ../data/{train,val,test} from the raw dataset using the default ratios.
source_folder = "../dataset"
dest_folder = "../data"
split_dataset(source_folder=source_folder, dest_folder=dest_folder)


Dataset split completed!


In [10]:
# Print the per-class file counts of every split folder.
folders = [
    "../data/train",
    "../data/val",
    "../data/test",
]

for folder in folders:
    print(f'----In {folder} we have----')
    counts = count_files_in_subfolders(folder)
    for subfolder in counts:
        count = counts[subfolder]
        print(f"{subfolder}: {count} files")

----In ../data/train we have----
dew: 558 files
fogsmog: 680 files
frost: 380 files
glaze: 511 files
hail: 472 files
lightning: 301 files
rain: 420 files
rainbow: 185 files
rime: 928 files
sandstorm: 553 files
snow: 496 files
----In ../data/val we have----
dew: 69 files
fogsmog: 85 files
frost: 47 files
glaze: 63 files
hail: 59 files
lightning: 37 files
rain: 52 files
rainbow: 23 files
rime: 116 files
sandstorm: 69 files
snow: 62 files
----In ../data/test we have----
dew: 71 files
fogsmog: 86 files
frost: 48 files
glaze: 65 files
hail: 60 files
lightning: 39 files
rain: 54 files
rainbow: 24 files
rime: 116 files
sandstorm: 70 files
snow: 63 files
