In [6]:
import glob
from pathlib import Path

In [8]:
folders = glob.glob('./dataset/*')

In [10]:
folders

['./dataset/xamthomonas', './dataset/healthy', './dataset/segatoka']

In [18]:
# Load dataset
full_dataset = {}
for folder in folders:
    images = glob.glob(folder + '/*.jpg')
    print(folder + ': ' + str(len(images)) + ' images')
    full_dataset[folder] = images

./dataset/xamthomonas: 814 images
./dataset/healthy: 155 images
./dataset/segatoka: 320 images


In [33]:
# Helper function
def get_class_name(folder, prefix='./dataset/'):
    return folder[len(prefix):]

In [23]:
# Split dataset into train, val, and test

In [24]:
val_size = 20
test_size = 30

In [44]:
import shutil
import os
import random
random.seed(42)

for y in full_dataset.keys():
    X_raw = full_dataset[y]
    # Shuffle
    X = random.shuffle(X_raw)
    label = get_class_name(y)
    
    # Create directories
    Path("./processed/train/"+label).mkdir(parents=True, exist_ok=True)
    Path("./processed/val/"+label).mkdir(parents=True, exist_ok=True)
    Path("./processed/test/"+label).mkdir(parents=True, exist_ok=True)
    
    
    # Get splits
    val_index = len(X) - val_size - test_size
    test_index = val_index + val_size
    train_set = X[:val_index]
    val_set = X[val_index:test_index]
    test_set = X[test_index:]
    
    # Print stats
    print(label, len(train_set), len(val_set), len(test_set))
    
    # Copy files
    for src_train in train_set:
        shutil.copy(src_train, "./processed/train/"+label+'/'+os.path.basename(src_train))
    for src_val in val_set:
        shutil.copy(src_val, "./processed/val/"+label+'/'+os.path.basename(src_val))
    for src_test in test_set:
        shutil.copy(src_test, "./processed/test/"+label+'/'+os.path.basename(src_test))

xamthomonas 774 20 20
healthy 115 20 20
segatoka 280 20 20
