In [1]:
import os

# Project root: the parent of the directory the notebook is run from.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Input location for the untouched source datasets.
RAW_DATA_DIR = os.path.join(BASE_DIR, 'raw')

# Output locations written by the cells below.
PROCESSED_AUDIO_DIR = os.path.join(BASE_DIR, 'processed/audio')
PROCESSED_IMAGES_DIR = os.path.join(BASE_DIR, 'processed/images')
# NOTE(review): LABELS_PATH is not referenced by any cell visible here.
LABELS_PATH = os.path.join(BASE_DIR, 'processed/labels.csv')

# Create the output folders up front; exist_ok keeps re-runs of this cell harmless.
os.makedirs(name=PROCESSED_IMAGES_DIR, exist_ok=True)
os.makedirs(name=PROCESSED_AUDIO_DIR, exist_ok=True)

In [2]:
import os
import shutil
import random
from glob import glob

def sort_and_copy_images(source_dirs, processed_dir, split_ratio, seed=None, extensions=('.jpg',)):
    """Pool images per emotion from several datasets and copy them into split folders.

    Every sub-directory of a source directory is treated as one emotion class.
    For each class, the matching images from all source directories are pooled,
    shuffled, and copied to ``processed_dir/<split>/<emotion>/`` under the name
    ``<split>_<emotion>_<NNNN><ext>``.

    Args:
        source_dirs: Directories that each contain one sub-directory per emotion.
            Directories that do not exist are skipped.
        processed_dir: Root directory that receives the ``train``, ``test`` and
            ``validation`` folders.
        split_ratio: Mapping with the fractions for ``'train'`` and ``'test'``.
            Only these two keys are read; ``validation`` receives the remainder.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` state is used.
        extensions: File suffixes (including the dot) of the images to collect.

    Raises:
        FileNotFoundError: If none of ``source_dirs`` exists.
    """
    existing_sources = [path for path in source_dirs if os.path.isdir(path)]
    if not existing_sources:
        raise FileNotFoundError(
            f"None of the source directories exist: {list(source_dirs)}"
        )

    # Collect class names from every source, not only the first one, and ignore
    # stray files so they do not turn into empty class folders.
    # NOTE(review): folders are matched by exact name, so datasets that spell the
    # same class differently end up as separate classes - confirm the raw folder
    # names agree.
    emotions = sorted({
        entry
        for source_dir in existing_sources
        for entry in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, entry))
    })

    # A private generator keeps a seeded run from touching the global random state.
    rng = random if seed is None else random.Random(seed)

    for emotion in emotions:
        files = []
        for source_dir in existing_sources:
            emotion_path = os.path.join(source_dir, emotion)
            if not os.path.isdir(emotion_path):
                continue
            for suffix in extensions:
                files.extend(glob(os.path.join(emotion_path, '*' + suffix)))

        if not files:
            # Nothing to copy: do not create empty class directories.
            continue

        # glob order depends on the filesystem; sort first so a seed is meaningful.
        files.sort()
        rng.shuffle(files)

        total_files = len(files)
        train_count = int(total_files * split_ratio['train'])
        test_count = int(total_files * split_ratio['test'])

        splits = {
            'train': files[:train_count],
            'test': files[train_count:train_count + test_count],
            # Whatever is left after train and test goes to validation.
            'validation': files[train_count + test_count:],
        }

        for split, split_files in splits.items():
            target_dir = os.path.join(processed_dir, split, emotion)
            os.makedirs(target_dir, exist_ok=True)

            for idx, file_path in enumerate(split_files, start=1):
                ext = os.path.splitext(file_path)[1]
                new_file_name = f"{split}_{emotion}_{idx:04d}{ext}"
                shutil.copy(file_path, os.path.join(target_dir, new_file_name))

In [3]:
# Fractions of each emotion's images assigned to every split.
split_ratio = dict(train=0.7, test=0.2, validation=0.1)

# Raw image datasets that are pooled before splitting.
CK_DIR, FER_TRAIN_DIR, FER_TEST_DIR = (
    os.path.join(RAW_DATA_DIR, relative)
    for relative in ('ck', 'fer/train', 'fer/test')
)

# Arguments in order: source directories, output root, split fractions.
sort_and_copy_images(
    [CK_DIR, FER_TRAIN_DIR, FER_TEST_DIR],
    PROCESSED_IMAGES_DIR,
    split_ratio,
)

In [4]:
def sort_and_copy_audio(source_dir, processed_dir, split_ratio, seed=None):
    """Copy RAVDESS recordings into ``processed_dir/<split>/<emotion>/``.

    Recordings are discovered as ``source_dir/Actor_*/*.wav``. RAVDESS file names
    consist of seven dash-separated codes
    (modality-channel-emotion-intensity-statement-repetition-actor); the third
    code selects the emotion. Files with fewer than seven codes, an unknown
    emotion code, or an emotion outside the allowed set (``calm``) are skipped.

    Copies are named
    ``<split>-<emotion>-<intensity>-<actor>-<statement>-<repetition>-<NNNN><ext>``
    where ``NNNN`` counts files per split and emotion. The actor id is written as
    the fourth dash-separated field because ``create_audio_labels_csv`` in this
    notebook reads the actor from that position.

    Args:
        source_dir: RAVDESS root containing the ``Actor_*`` folders.
        processed_dir: Root directory that receives the split folders.
        split_ratio: Mapping with the fractions for ``'train'`` and ``'test'``.
            Only these two keys are read; ``validation`` receives the remainder.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` state is used.
    """
    emotion_map = {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fear',
        '07': 'disgust',
        '08': 'surprised'
    }
    allowed_emotions = {'angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprised'}

    # Filter before splitting so the ratios apply to the files that are really
    # copied. Sorting makes the shuffle input independent of filesystem order.
    recordings = []
    for file_path in sorted(glob(os.path.join(source_dir, 'Actor_*', '*.wav'))):
        stem, ext = os.path.splitext(os.path.basename(file_path))
        fields = stem.split('-')
        if len(fields) < 7:
            continue

        emotion = emotion_map.get(fields[2], 'unknown')
        if emotion not in allowed_emotions:
            continue

        recordings.append((file_path, emotion, fields, ext))

    # A private generator keeps a seeded run from touching the global random state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(recordings)

    total = len(recordings)
    train_count = int(total * split_ratio['train'])
    test_count = int(total * split_ratio['test'])

    splits = {
        'train': recordings[:train_count],
        'test': recordings[train_count:train_count + test_count],
        # Whatever is left after train and test goes to validation.
        'validation': recordings[train_count + test_count:],
    }

    for split, split_recordings in splits.items():
        counters = {}

        for file_path, emotion, fields, ext in split_recordings:
            target_dir = os.path.join(processed_dir, split, emotion)
            os.makedirs(target_dir, exist_ok=True)

            counters[emotion] = counters.get(emotion, 0) + 1

            # fields[3] = intensity, fields[6] = actor, fields[4:6] = statement and
            # repetition; any codes beyond the seventh are kept after them.
            name_fields = [split, emotion, fields[3], fields[6]]
            name_fields += fields[4:6] + fields[7:]
            name_fields.append(f"{counters[emotion]:04d}")
            new_file_name = '-'.join(name_fields) + ext

            shutil.copy(file_path, os.path.join(target_dir, new_file_name))


In [5]:
# Audio input and output roots, plus the split fractions used for the recordings.
split_ratio = dict(train=0.7, test=0.2, validation=0.1)
RAW_AUDIO_DIR = os.path.join(RAW_DATA_DIR, 'ravdess')
PROCESSED_AUDIO_DIR = os.path.join(BASE_DIR, 'processed/audio')

sort_and_copy_audio(
    source_dir=RAW_AUDIO_DIR,
    processed_dir=PROCESSED_AUDIO_DIR,
    split_ratio=split_ratio,
)

In [6]:
import csv

def create_image_labels_csv(processed_images_dir, output_csv_path):
    """Write a CSV that lists every processed image with its split and emotion.

    Walks ``processed_images_dir/<split>/<emotion>/`` for the splits ``train``,
    ``test`` and ``validation`` and records each file whose name ends in
    ``.jpg`` or ``.png`` (case-insensitive). Missing split folders and
    non-directory entries inside a split folder are ignored.

    Args:
        processed_images_dir: Root directory holding the split folders.
        output_csv_path: Destination file; overwritten if it exists. Columns
            are ``filename``, ``split``, ``emotion``.
    """
    image_suffixes = ('.jpg', '.png')
    records = []

    for split_name in ('train', 'test', 'validation'):
        split_root = os.path.join(processed_images_dir, split_name)
        if os.path.exists(split_root):
            for label in os.listdir(split_root):
                label_dir = os.path.join(split_root, label)
                if os.path.isdir(label_dir):
                    records.extend(
                        [image_name, split_name, label]
                        for image_name in os.listdir(label_dir)
                        if image_name.lower().endswith(image_suffixes)
                    )

    with open(output_csv_path, mode='w', newline='') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(['filename', 'split', 'emotion'])
        out.writerows(records)


In [7]:
def create_audio_labels_csv(processed_audio_dir, output_csv_path):
    """Write a CSV that lists every processed recording with split, emotion and actor.

    Walks ``processed_audio_dir/<split>/<emotion>/`` for the splits ``train``,
    ``test`` and ``validation`` and records each file whose name ends in
    ``.wav`` (case-insensitive). Missing split folders and non-directory entries
    inside a split folder are ignored.

    The actor value is the fourth dash-separated field of the file name when the
    name has more than four such fields, otherwise ``'unknown'``.

    Args:
        processed_audio_dir: Root directory holding the split folders.
        output_csv_path: Destination file; overwritten if it exists. Columns
            are ``filename``, ``split``, ``emotion``, ``actor``.
    """
    header = ['filename', 'split', 'emotion', 'actor']
    records = []

    for split_name in ('train', 'test', 'validation'):
        split_root = os.path.join(processed_audio_dir, split_name)
        if not os.path.exists(split_root):
            continue

        for label in os.listdir(split_root):
            label_dir = os.path.join(split_root, label)
            if not os.path.isdir(label_dir):
                continue

            for clip_name in os.listdir(label_dir):
                if not clip_name.lower().endswith('.wav'):
                    continue

                fields = clip_name.split('-')
                # NOTE(review): assumes the files were written with the actor id
                # as the fourth dash-separated field - confirm against the naming
                # used by the step that produced them.
                actor_id = 'unknown' if len(fields) <= 4 else fields[3]
                records.append([clip_name, split_name, label, actor_id])

    with open(output_csv_path, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerows([header] + records)

In [8]:
# Destinations of the two label tables, next to the processed data folders.
IMAGE_LABEL_PATH, AUDIO_LABEL_PATH = (
    os.path.join(BASE_DIR, 'processed', csv_name)
    for csv_name in ('image_labels.csv', 'audio_labels.csv')
)

create_image_labels_csv(
    processed_images_dir=PROCESSED_IMAGES_DIR,
    output_csv_path=IMAGE_LABEL_PATH,
)
create_audio_labels_csv(
    processed_audio_dir=PROCESSED_AUDIO_DIR,
    output_csv_path=AUDIO_LABEL_PATH,
)