In [34]:
import numpy as np
import os
import random
import shutil
import cv2

from natsort import natsorted
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## Collect Data

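Images are collected from two source folders with different layouts: source 1 has one sub-folder per category, while source 2 is a flat folder whose filenames start with the category name followed by an underscore.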

In [18]:
# Source 1: read below as one sub-folder per category, each holding that category's images.
train1_dataset_path = "data/train_Images_Source_1/"
# Source 2: read below as a flat folder; the category is the filename prefix before the first "_".
train2_dataset_path = "data/train_Images_Source_2/"

In [19]:
# Build the list of (image_path, category) pairs from source 1, where each
# sub-folder name is the category of the images it contains.
dataset = []
for category in os.listdir(train1_dataset_path):
    category_path = os.path.join(train1_dataset_path, category)
    if not os.path.isdir(category_path):
        continue  # entries that are not folders carry no category
    dataset += [
        (os.path.join(category_path, image_file), category)
        for image_file in os.listdir(category_path)
    ]

In [20]:
# Source 2 is flat: every entry is added with the text before the first
# underscore of its filename as the category.
dataset.extend(
    (os.path.join(train2_dataset_path, image_file), image_file.partition('_')[0])
    for image_file in os.listdir(train2_dataset_path)
)

In [21]:
# os.listdir returns entries in arbitrary order, so sort first to get a stable
# starting point, then shuffle with a fixed seed so the train/validation split
# derived from this order is the same on every run.
dataset.sort()
random.seed(42)
random.shuffle(dataset)

After collecting the image paths from both sources, copy every image into a single dataset folder with one sub-folder per category.

In [24]:
all_train = "data/dataset"

# Copy each collected image into <all_train>/<category>/. A category folder is
# created the first time it is needed; shutil.copy keeps the source filename
# because the destination is a directory.
prepared_folders = set()
for image_path, category in dataset:
    category_folder = os.path.join(all_train, category)
    if category_folder not in prepared_folders:
        os.makedirs(category_folder, exist_ok=True)
        prepared_folders.add(category_folder)
    shutil.copy(image_path, category_folder)

### Data Splitting

In [25]:
# The first 80% of the (already shuffled) list becomes the training set,
# the remainder the validation set.
train_ratio = 0.8
train_size = int(train_ratio * len(dataset))

train_dataset, val_dataset = dataset[:train_size], dataset[train_size:]

In [26]:
train_folder = 'data/train_set'
val_folder = 'data/val_set'

# Same copy routine for both splits: each image goes to
# <split folder>/<category>/, with the folder created on demand.
for split_folder, split_items in (
    (train_folder, train_dataset),
    (val_folder, val_dataset),
):
    for image_path, category in split_items:
        category_folder = os.path.join(split_folder, category)
        os.makedirs(category_folder, exist_ok=True)
        shutil.copy(image_path, category_folder)

## Data Over-sampling

In [27]:
def load_and_augment_image(image_path, datagen):
    """Load an image and pass it through the generator's random transform.

    Args:
        image_path: Path handed to ``load_image``.
        datagen: Object exposing ``random_transform(img)``.

    Returns:
        The result of ``datagen.random_transform`` on the loaded image, or
        ``None`` when ``load_image`` returned ``None``.
    """
    img = load_image(image_path)
    if img is None:
        return None
    return datagen.random_transform(img)

def load_image(image_path):
    """Read an image with OpenCV and return it converted from BGR to RGB.

    Args:
        image_path: Path passed to ``cv2.imread``.

    Returns:
        The RGB image, or ``None`` when ``cv2.imread`` returned ``None`` or
        any exception was raised (the exception is printed, not re-raised).
    """
    try:
        bgr = cv2.imread(image_path)
        return None if bgr is None else cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error loading image {image_path}: {str(e)}")
        return None

def save_image(img, path):
    """Convert an RGB image to BGR and write it to disk with OpenCV.

    Failures are reported on stdout rather than raised, so a batch job can
    keep going.

    Args:
        img: RGB image to save. ``None`` is reported as an error.
        path: Destination file path passed to ``cv2.imwrite``.

    Returns:
        ``True`` if the file was written, ``False`` otherwise.
    """
    if img is None:
        print(f"Error saving image to {path}: no image data")
        return False
    try:
        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        # cv2.imwrite reports most failures (missing folder, unwritable path)
        # through its boolean return value instead of raising.
        written = cv2.imwrite(path, img_bgr)
    except Exception as e:
        print(f"Error saving image to {path}: {str(e)}")
        return False
    if not written:
        print(f"Error saving image to {path}: cv2.imwrite returned False")
        return False
    print(f"Saved image to {path}")
    return True

In [32]:
dataset_path = 'data/dataset'

# Largest number of files found in any single directory under dataset_path;
# the over-sampling loop tops every class up to this count.
target_num_images = max(
    len(filenames) for _root, _subdirs, filenames in os.walk(dataset_path)
)

# Random augmentation settings used to synthesise the extra images.
augmentation_options = dict(
    rotation_range=20,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

In [35]:
# Top every class up to target_num_images by writing randomly augmented copies
# of its own images into <class folder>/augmented/.
for class_folder in os.listdir(dataset_path):
    class_folder_path = os.path.join(dataset_path, class_folder)
    if not os.path.isdir(class_folder_path):
        continue  # stray files next to the class folders are not classes

    # Only regular files are source images. This keeps the 'augmented'
    # sub-folder (created below, or left over from an earlier run) out of both
    # the class count and the random pick. The listing is taken once because
    # new files are written to the sub-folder, never to the class folder.
    source_images = [
        entry for entry in os.listdir(class_folder_path)
        if os.path.isfile(os.path.join(class_folder_path, entry))
    ]
    num_images = len(source_images)

    num_additional_images = target_num_images - num_images
    if num_additional_images <= 0:
        continue

    print(f'Class {class_folder} needs {num_additional_images} additional images.')

    augmented_folder = os.path.join(class_folder_path, 'augmented')
    os.makedirs(augmented_folder, exist_ok=True)

    # The loop also stops when no readable source image is left, so a folder
    # full of unreadable files cannot spin forever.
    while num_additional_images > 0 and source_images:
        random_image = random.choice(source_images)
        image_path = os.path.join(class_folder_path, random_image)

        img = load_and_augment_image(image_path, datagen)
        if img is None:
            # Unreadable file: drop it from the pool and retry without
            # consuming one of the slots still to be filled.
            source_images.remove(random_image)
            continue

        augmented_image_path = os.path.join(augmented_folder, f'augmented_{num_additional_images}.jpg')
        save_image(img, augmented_image_path)

        num_additional_images -= 1

    if num_additional_images > 0:
        print(f'Class {class_folder} is still missing {num_additional_images} images: no readable source images left.')

print('Oversampling completed!')

Class Ascariasis needs 754 additional images.
Saved image to data/dataset\Ascariasis\augmented\augmented_754.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_753.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_752.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_751.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_750.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_749.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_748.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_747.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_746.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_745.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_744.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_743.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_742.jpg
Saved image to data/dataset\Ascariasis\augmented\augmented_741.jpg
Saved image to d

KeyboardInterrupt: 

## Data Under-sampling

In [37]:
# Same "data/dataset" folder that the collection step filled (`all_train`) and
# the over-sampling step read (`dataset_path`).
all_train_path = "data/dataset"

In [38]:
# Rebuild the (image_path, category) list from the merged dataset folder.
dataset = []
for category in os.listdir(all_train_path):
    category_path = os.path.join(all_train_path, category)
    if os.path.isdir(category_path):
        for image_file in os.listdir(category_path):
            image_path = os.path.join(category_path, image_file)
            # Keep regular files only: the over-sampling step creates an
            # 'augmented' sub-folder inside class folders, and shutil.copy
            # cannot copy a directory.
            if os.path.isfile(image_path):
                dataset.append((image_path, category))

In [39]:
# os.listdir returns entries in arbitrary order, so sort first to get a stable
# starting point, then shuffle with a fixed seed so the train/validation split
# derived from this order is the same on every run.
dataset.sort()
random.seed(42)
random.shuffle(dataset)

In [40]:
# The first 80% of the (already shuffled) list becomes the training set,
# the remainder the validation set.
train_ratio = 0.8
train_size = int(train_ratio * len(dataset))

train_dataset, val_dataset = dataset[:train_size], dataset[train_size:]

In [42]:
train_folder = 'data/train_set_undersample'
val_folder = 'data/val_set_undersample'

# Same copy routine for both splits: each image goes to
# <split folder>/<category>/, and a message is printed once the split is done.
for split_folder, split_items, done_message in (
    (train_folder, train_dataset, "train done!"),
    (val_folder, val_dataset, "val done!"),
):
    for image_path, category in split_items:
        category_folder = os.path.join(split_folder, category)
        os.makedirs(category_folder, exist_ok=True)
        shutil.copy(image_path, category_folder)

    print(done_message)

train done!
val done!
