In [11]:
import os
import shutil
from sklearn.model_selection import train_test_split
from PIL import Image
import cv2
import random

In [5]:
# Emotion labels; each label is also used as the name of a per-class image folder.
classes = [
    'happy',
    'neutral',
    'sad',
]

# Names of the dataset split folders.
dirs = [
    'train',
    'validation',
]

In [6]:
# Report how many raw images each class folder contains
for c in classes:
    raw_class_dir = f"../data/raw/{c}"
    image_count = len(os.listdir(raw_class_dir))
    print(f'{c}: {image_count}')

happy: 12729
neutral: 10225
sad: 10011


In [8]:
def calculate_noise_variance(image_path):
    """Calculate the variance of the Laplacian for an image.

    The image is loaded in grayscale, its Laplacian is computed with 64-bit
    float output, and the variance of that response is returned.

    Args:
        image_path: Path of the image file to measure.

    Returns:
        The variance of the Laplacian response. If OpenCV cannot load the
        file, an error is printed and 0.0 is returned.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # read; handing None to cv2.Laplacian would raise and abort the
        # caller's whole loop because of a single bad file.
        print(f"Error loading image: {image_path}")
        return 0.0

    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    return laplacian.var()

classes = ['happy', 'sad', 'neutral']
input_dir = "../data/raw"
output_dir = "../data/preprocessed/filtered"

# An image is kept only when its variance of Laplacian exceeds this value.
variance_threshold = 100

os.makedirs(output_dir, exist_ok=True)
for c in classes:
    os.makedirs(os.path.join(output_dir, c), exist_ok=True)

for c in classes:
    class_folder = os.path.join(input_dir, c)
    images = os.listdir(class_folder)

    # Names of the images in this class that pass the variance threshold.
    filtered_images = [
        img for img in images
        if calculate_noise_variance(os.path.join(class_folder, img)) > variance_threshold
    ]

    for img in filtered_images:
        image_path = os.path.join(class_folder, img)
        output_path = os.path.join(output_dir, c, img)

        # Copy the file unchanged instead of decoding and re-encoding it:
        # the image was already decoded once for the variance check, and
        # re-encoding would recompress lossy formats for no benefit.
        shutil.copy2(image_path, output_path)

print("Successfully filtered images")

Successfully filtered images


In [9]:
# Report how many images remain in each class folder after filtering
for c in classes:
    filtered_names = os.listdir(f"../data/preprocessed/filtered/{c}")
    print(f'{c}: {len(filtered_names)}')


happy: 11791
sad: 8505
neutral: 8943


In [15]:
# Downsample every class to the size of the smallest class so that all classes
# end up with the same number of images. Sampling uses a fixed seed (42) on a
# sorted file list so the same images are selected on every run.
input_dir = "../data/preprocessed/filtered"
output_dir = "../data/preprocessed/downsampled"

os.makedirs(output_dir, exist_ok=True)
for c in classes:
    class_output_folder = os.path.join(output_dir, c)
    if os.path.exists(class_output_folder):
        # Remove the results of a previous run so old samples do not
        # accumulate next to the new ones.
        for f in os.listdir(class_output_folder):
            os.remove(os.path.join(class_output_folder, f))
    else:
        os.makedirs(class_output_folder)

min_images = min(len(os.listdir(os.path.join(input_dir, c))) for c in classes)
print(f"Number of images in the smallest class: {min_images}")

# Dedicated seeded generator: the selection is reproducible and does not
# depend on (or disturb) the state of the global `random` module.
rng = random.Random(42)

for c in classes:
    class_folder = os.path.join(input_dir, c)
    # os.listdir returns names in arbitrary order; sort so that the seeded
    # sample is drawn from the same sequence every time.
    images = sorted(os.listdir(class_folder))

    downsampled_images = rng.sample(images, min_images)

    print(f"Downsampling {c} class")
    print(f"Original number of images: {len(images)}")
    print(f"Downsampled number of images: {len(downsampled_images)}")

    for img in downsampled_images:
        image_path = os.path.join(class_folder, img)
        output_path = os.path.join(output_dir, c, img)

        # Plain file copy: no decode/re-encode round trip, so lossy formats
        # are not recompressed at this stage.
        shutil.copy2(image_path, output_path)

print("Downsampling completed")

Number of images in the smallest class: 8505
Downsampling happy class
Original number of images: 11791
Downsampled number of images: 8505
Downsampling sad class
Original number of images: 8505
Downsampled number of images: 8505
Downsampling neutral class
Original number of images: 8943
Downsampled number of images: 8505
Downsampling completed


In [23]:
# Report the per-class image count after downsampling
for c in classes:
    downsampled_class_dir = f"../data/preprocessed/downsampled/{c}"
    downsampled_count = len(os.listdir(downsampled_class_dir))
    print(f'{c}: {downsampled_count}')

happy: 8505
sad: 8505
neutral: 8505


In [25]:
# Resize images to 299x299

target_size = (299, 299)  # passed to cv2.resize as (width, height)
input_dir = "../data/preprocessed/downsampled"
output_dir = "../data/preprocessed/resized"

for c in classes:
    class_folder = os.path.join(input_dir, c)
    output_class_folder = os.path.join(output_dir, c)
    os.makedirs(output_class_folder, exist_ok=True)

    # Remove the results of a previous run (same approach as the downsampling
    # stage). Otherwise images that are no longer part of the input would
    # stay in the output folder and the classes would stop being balanced.
    for f in os.listdir(output_class_folder):
        os.remove(os.path.join(output_class_folder, f))

    images = os.listdir(class_folder)
    for img_name in images:
        img_path = os.path.join(class_folder, img_name)

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None when the file cannot be read
            print(f"Error loading image: {img_path}")
            continue

        resized_img = cv2.resize(img, target_size)

        output_path = os.path.join(output_class_folder, img_name)
        # cv2.imwrite reports failure through its return value, not an exception
        if not cv2.imwrite(output_path, resized_img):
            print(f"Error writing image: {output_path}")

print("Successfully resized images")

Successfully resized images


In [26]:
# Report the per-class image count after resizing
for c in classes:
    resized_names = os.listdir(f"../data/preprocessed/resized/{c}")
    resized_count = len(resized_names)
    print(f'{c}: {resized_count}')

happy: 8505
sad: 8505
neutral: 8505


In [27]:
# Split data into train and validation sets
input_dir = "../data/preprocessed/resized"
output_dir = "../data/final"

os.makedirs(output_dir, exist_ok=True)
for d in dirs:
    for c in classes:
        split_class_folder = os.path.join(output_dir, d, c)
        os.makedirs(split_class_folder, exist_ok=True)
        # Remove the results of a previous run. A stale file could otherwise
        # leave the same image in both the train and the validation folder.
        for f in os.listdir(split_class_folder):
            os.remove(os.path.join(split_class_folder, f))

test_size = 0.2  # fraction of each class that goes to the validation set

for c in classes:
    class_folder = os.path.join(input_dir, c)
    # os.listdir returns names in arbitrary order; sort so that the fixed
    # random_state always produces the same split.
    images = sorted(os.listdir(class_folder))

    train_images, val_images = train_test_split(images, test_size=test_size, random_state=42)

    for split_images, target_dir in [(train_images, 'train'), (val_images, 'validation')]:
        for image_name in split_images:
            image_path = os.path.join(class_folder, image_name)
            output_path = os.path.join(output_dir, target_dir, c, image_name)

            # Plain file copy: the images are already in their final form, so
            # decoding and re-encoding them would only recompress lossy formats.
            shutil.copy2(image_path, output_path)

print("Successfully split data into train and validation sets")

Successfully split data into train and validation sets


In [28]:
# Report the image count of every class folder in each split
for d in dirs:
    for c in classes:
        split_class_dir = f"../data/final/{d}/{c}"
        split_count = len(os.listdir(split_class_dir))
        print(f'{d} {c}: {split_count}')

train happy: 6804
train sad: 6804
train neutral: 6804
validation happy: 1701
validation sad: 1701
validation neutral: 1701
