# CREMA-D Audio Preprocessing Pipeline
This notebook preprocesses the CREMA-D audio dataset (https://www.kaggle.com/datasets/ejlok1/cremad) stored in Google Drive. It creates stratified train/test/validation splits (70%/20%/10%), generates 224×224 mel spectrogram images, extracts embeddings from those images using pretrained ResNet18 and VGG16 models, and computes MFCC features from the audio.

Before running, ensure your Google Drive is mounted and `DATA_ROOT` points to the folder containing the `AudioWAV` directory.

In [1]:
# Mount Google Drive at /content/drive (only in Colab); DATA_ROOT below lives
# under this mount point.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    # `google.colab` cannot be imported, so this is not a Colab runtime.
    # Nothing is mounted; the paths configured below must already be reachable.
    pass

Mounted at /content/drive


In [2]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import librosa
from PIL import Image
import torch
from torchvision import models, transforms
import warnings
warnings.filterwarnings('ignore')

# --- Paths -------------------------------------------------------------
# Root folder of the dataset on Google Drive; it must contain `AudioWAV`.
DATA_ROOT = '/content/drive/MyDrive/CREMAD'  # modify if needed
AUDIO_DIR = os.path.join(DATA_ROOT, 'AudioWAV')      # input WAV files
SPLITS_DIR = os.path.join(DATA_ROOT, 'splits')       # train/test/val CSV files
SPEC_DIR = os.path.join(DATA_ROOT, 'Spectrograms')   # mel spectrogram PNGs
EMB_DIR = os.path.join(DATA_ROOT, 'Embeddings')      # CNN embeddings (.npz)
MFCC_DIR = os.path.join(DATA_ROOT, 'MFCCs')          # MFCC features (.npz)

# Make sure every output folder exists before any stage writes into it.
for _out_dir in (SPLITS_DIR, SPEC_DIR, EMB_DIR, MFCC_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Emotion codes used as class labels (matched against the file names).
CLASSES = ['HAP', 'SAD', 'ANG', 'NEU', 'DIS', 'FEA']

# Run PyTorch on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# === Create stratified train/test/validation splits ===
# glob returns paths in whatever order the file system lists them, and
# train_test_split shuffles rows by position. Sorting the paths is therefore
# required for `random_state=42` to produce the same splits on every run.
# Without it, a re-run could assign a clip to a different split while the
# outputs cached under the old split folder are still on disk.
# If outputs from an earlier (unsorted) run exist, clear the per-split output
# folders before regenerating them.
file_paths = sorted(glob.glob(os.path.join(AUDIO_DIR, '*.wav')))
records = []
skipped_names = []

for fp in file_paths:
    fname = os.path.basename(fp)
    # The label is the first known emotion code that occurs in the file name.
    label_found = next((cls for cls in CLASSES if cls in fname), None)
    if label_found is None:
        # Keep track of unlabeled files instead of dropping them silently.
        skipped_names.append(fname)
        continue
    records.append({
        'file_path': fp,
        'file_name': fname,
        'label': label_found
    })

if skipped_names:
    print(f'Skipped {len(skipped_names)} file(s) with no known emotion code, e.g. {skipped_names[:3]}')

# One row per labeled audio file
df = pd.DataFrame(records)

# Check that the dataframe is not empty
if df.empty:
    raise ValueError('No audio files found. Please check the AUDIO_DIR path.')

# Perform stratified split: 70% train, 20% test, 10% validation.
# NOTE(review): stratification is by emotion label only. If the file names
# encode a speaker ID (CREMA-D names appear to start with an actor ID -
# confirm), clips from one speaker can end up in several splits.
train_val_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.125, stratify=train_val_df['label'], random_state=42)  # 0.125 of 80% = 10%

# Every labeled file must land in exactly one split.
assert len(train_df) + len(test_df) + len(val_df) == len(df)

# Save splits to CSV files
train_df.to_csv(os.path.join(SPLITS_DIR, 'train.csv'), index=False)
test_df.to_csv(os.path.join(SPLITS_DIR, 'test.csv'), index=False)
val_df.to_csv(os.path.join(SPLITS_DIR, 'val.csv'), index=False)

print(f'Train size: {len(train_df)}, Test size: {len(test_df)}, Val size: {len(val_df)}')

Train size: 5208, Test size: 1489, Val size: 745


In [4]:
# === Helper functions ===
def create_dir(path):
    """Make sure a directory exists at ``path``.

    Nothing is done when ``path`` already exists; otherwise the directory is
    created together with any missing parent directories.

    Args:
        path (str): Path to the directory.
    """
    if os.path.exists(path):
        return
    os.makedirs(path, exist_ok=True)

def save_melspectrogram(audio_path, save_path):
    """Generate and save a 224x224 mel spectrogram from an audio file.

    The audio is loaded at its native sampling rate (``sr=None``), converted
    to a 128-band mel spectrogram, expressed in dB relative to its maximum,
    min-max scaled to the 0-255 range, resized to 224x224 and written to
    ``save_path``.

    Any exception is caught and reported with ``print``; nothing is raised
    and, in that case, no file is guaranteed to have been written.

    Args:
        audio_path (str): Path to the input WAV file.
        save_path (str): Path to save the generated PNG file.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel_spec, ref=np.max)
        # Normalize between 0 and 255
        db_min = mel_db.min()
        db_span = mel_db.max() - db_min
        if db_span > 0:
            mel_norm = (mel_db - db_min) / db_span
        else:
            # A constant spectrogram (e.g. digital silence) has zero range.
            # Dividing would produce NaN, and NaN cast to uint8 is undefined,
            # so emit an all-zero image instead.
            mel_norm = np.zeros_like(mel_db)
        mel_img = (mel_norm * 255).astype(np.uint8)
        img = Image.fromarray(mel_img)
        img = img.resize((224, 224))
        img.save(save_path)
    except Exception as e:
        print(f'Error generating spectrogram for {audio_path}: {e}')

def load_image(image_path):
    """Open an image file and return it converted to RGB mode.

    Args:
        image_path (str): Path to the image file.

    Returns:
        PIL.Image: The image after ``convert('RGB')``.
    """
    opened = Image.open(image_path)
    rgb_image = opened.convert('RGB')
    return rgb_image

# Preprocessing applied to every spectrogram image before it is passed to the
# pretrained CNNs: resize, centre-crop to 224x224, convert to a tensor and
# normalise each channel with the mean/std below (the statistics
# conventionally used with torchvision's ImageNet-pretrained weights).
# NOTE(review): save_melspectrogram already writes 224x224 images, so
# Resize(256) followed by CenterCrop(224) upsamples and then trims the image
# borders - confirm that discarding the spectrogram edges is intended.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]
_embedding_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
embedding_transform = transforms.Compose(_embedding_steps)

In [5]:
# === Generate mel spectrograms ===
def process_spectrograms(split_df, split_name):
    """Write one mel spectrogram PNG per audio file of a split.

    Images are stored as ``SPEC_DIR/<split_name>/<label>/<file stem>.png``.
    Files whose PNG already exists are skipped, so the function can be re-run
    to resume an interrupted pass.

    Args:
        split_df (pd.DataFrame): DataFrame containing file_path, file_name and label.
        split_name (str): Name of the split ('train', 'test' or 'val').
    """
    rows = split_df.to_dict('records')
    for row in tqdm(rows, total=len(split_df), desc=f'Generating spectrograms for {split_name}'):
        stem, _ext = os.path.splitext(row['file_name'])
        png_path = os.path.join(SPEC_DIR, split_name, row['label'], stem + '.png')
        # The class folder is created lazily, the first time it is needed.
        create_dir(os.path.dirname(png_path))
        if os.path.exists(png_path):
            continue
        save_melspectrogram(row['file_path'], png_path)

# Generate the spectrogram images split by split (train, then test, then val)
for _spec_df, _spec_split in ((train_df, 'train'), (test_df, 'test'), (val_df, 'val')):
    process_spectrograms(_spec_df, _spec_split)

Generating spectrograms for train:   0%|          | 0/5208 [00:00<?, ?it/s]

Generating spectrograms for test:   0%|          | 0/1489 [00:00<?, ?it/s]

Generating spectrograms for val:   0%|          | 0/745 [00:00<?, ?it/s]

In [6]:
# === Generate embeddings using pretrained models ===
def extract_embeddings(model, model_name, split_name, overwrite=False):
    """Extract and save embeddings from spectrogram images for a given model and split.

    For every class in ``CLASSES`` the PNG files under
    ``SPEC_DIR/<split_name>/<label>`` are loaded, passed through
    ``embedding_transform`` and ``model``, and the flattened output is stored
    as ``EMB_DIR/<model_name>/<split_name>/<label>/<file stem>.npz`` with the
    keys ``embedding``, ``label`` and ``file_name``.

    Like the spectrogram and MFCC stages, existing output files are skipped by
    default so an interrupted run can be resumed without recomputing
    everything. Per-image failures are printed and do not stop the run.

    Args:
        model (torch.nn.Module): Pretrained model with final layer removed.
        model_name (str): Name of the model (e.g., 'ResNet18' or 'VGG16').
        split_name (str): Name of the split ('train', 'test' or 'val').
        overwrite (bool): When True, recompute and replace embeddings that
            already exist on disk. Defaults to False (skip existing files).
    """
    model.to(device)
    model.eval()
    base_dir = os.path.join(EMB_DIR, model_name, split_name)
    split_dir = os.path.join(SPEC_DIR, split_name)
    for label in CLASSES:
        out_dir = os.path.join(base_dir, label)
        # Created even when the class has no images, so the tree is complete.
        create_dir(out_dir)
        # Sorted so that progress and any error messages appear in a stable order.
        image_files = sorted(glob.glob(os.path.join(split_dir, label, '*.png')))
        for img_path in tqdm(image_files, desc=f'Embedding {model_name} - {split_name} - {label}', leave=False):
            file_name = os.path.splitext(os.path.basename(img_path))[0]
            save_path = os.path.join(out_dir, f'{file_name}.npz')
            if not overwrite and os.path.exists(save_path):
                continue
            try:
                img = load_image(img_path)
                # Add a batch dimension of 1: the model is run one image at a time.
                tensor = embedding_transform(img).unsqueeze(0).to(device)
                with torch.no_grad():
                    feat = model(tensor)
                # Flatten to a 1-D vector regardless of the model's output shape.
                embedding = feat.cpu().numpy().flatten()
                np.savez(save_path, embedding=embedding, label=label, file_name=file_name)
            except Exception as e:
                print(f'Error extracting embedding for {img_path}: {e}')

# Turn the pretrained backbones into feature extractors.
# ResNet18: the final `fc` layer becomes an identity, so the model returns
# whatever would have been fed into it.
resnet18 = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet18.fc = torch.nn.Identity()

# VGG16: keep every layer of the classifier head except the last one.
vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT)
_vgg_head_layers = list(vgg16.classifier.children())
vgg16.classifier = torch.nn.Sequential(*_vgg_head_layers[:-1])

# For each split, run ResNet18 first and VGG16 second.
_feature_extractors = [('ResNet18', resnet18), ('VGG16', vgg16)]
for split_name in ['train', 'test', 'val']:
    for _extractor_name, _extractor in _feature_extractors:
        extract_embeddings(_extractor, _extractor_name, split_name)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 206MB/s]


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


100%|██████████| 528M/528M [00:04<00:00, 129MB/s]


Embedding ResNet18 - train - HAP:   0%|          | 0/889 [00:00<?, ?it/s]

Embedding ResNet18 - train - SAD:   0%|          | 0/890 [00:00<?, ?it/s]

Embedding ResNet18 - train - ANG:   0%|          | 0/890 [00:00<?, ?it/s]

Embedding ResNet18 - train - NEU:   0%|          | 0/760 [00:00<?, ?it/s]

Embedding ResNet18 - train - DIS:   0%|          | 0/890 [00:00<?, ?it/s]

Embedding ResNet18 - train - FEA:   0%|          | 0/889 [00:00<?, ?it/s]

Embedding VGG16 - train - HAP:   0%|          | 0/889 [00:00<?, ?it/s]

Embedding VGG16 - train - SAD:   0%|          | 0/890 [00:00<?, ?it/s]

Embedding VGG16 - train - ANG:   0%|          | 0/890 [00:00<?, ?it/s]

Embedding VGG16 - train - NEU:   0%|          | 0/760 [00:00<?, ?it/s]

Embedding VGG16 - train - DIS:   0%|          | 0/890 [00:00<?, ?it/s]

Embedding VGG16 - train - FEA:   0%|          | 0/889 [00:00<?, ?it/s]

Embedding ResNet18 - test - HAP:   0%|          | 0/255 [00:00<?, ?it/s]

Embedding ResNet18 - test - SAD:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding ResNet18 - test - ANG:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding ResNet18 - test - NEU:   0%|          | 0/218 [00:00<?, ?it/s]

Embedding ResNet18 - test - DIS:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding ResNet18 - test - FEA:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding VGG16 - test - HAP:   0%|          | 0/255 [00:00<?, ?it/s]

Embedding VGG16 - test - SAD:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding VGG16 - test - ANG:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding VGG16 - test - NEU:   0%|          | 0/218 [00:00<?, ?it/s]

Embedding VGG16 - test - DIS:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding VGG16 - test - FEA:   0%|          | 0/254 [00:00<?, ?it/s]

Embedding ResNet18 - val - HAP:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding ResNet18 - val - SAD:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding ResNet18 - val - ANG:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding ResNet18 - val - NEU:   0%|          | 0/109 [00:00<?, ?it/s]

Embedding ResNet18 - val - DIS:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding ResNet18 - val - FEA:   0%|          | 0/128 [00:00<?, ?it/s]

Embedding VGG16 - val - HAP:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding VGG16 - val - SAD:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding VGG16 - val - ANG:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding VGG16 - val - NEU:   0%|          | 0/109 [00:00<?, ?it/s]

Embedding VGG16 - val - DIS:   0%|          | 0/127 [00:00<?, ?it/s]

Embedding VGG16 - val - FEA:   0%|          | 0/128 [00:00<?, ?it/s]

In [7]:
# === Generate MFCCs ===
def process_mfccs(split_df, split_name):
    """Compute and store MFCC features for every audio file of a split.

    Each file is loaded at its native sampling rate (``sr=None``) and 40
    MFCCs are computed. The result is written to
    ``MFCC_DIR/<split_name>/<label>/<file stem>.npz`` with the keys ``mfcc``,
    ``label`` and ``file_name``. Existing output files are skipped; failures
    are printed and do not interrupt the loop.

    Args:
        split_df (pd.DataFrame): DataFrame containing file_path, file_name and label.
        split_name (str): Name of the split ('train', 'test' or 'val').
    """
    rows = split_df.to_dict('records')
    for row in tqdm(rows, total=len(split_df), desc=f'Generating MFCCs for {split_name}'):
        wav_path = row['file_path']
        stem = os.path.splitext(row['file_name'])[0]
        emotion = row['label']
        out_path = os.path.join(MFCC_DIR, split_name, emotion, f'{stem}.npz')
        create_dir(os.path.dirname(out_path))
        if os.path.exists(out_path):
            continue
        try:
            signal, rate = librosa.load(wav_path, sr=None)
            coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40)
            np.savez(out_path, mfcc=coeffs, label=emotion, file_name=stem)
        except Exception as err:
            print(f'Error generating MFCC for {wav_path}: {err}')

# Compute the MFCC features split by split (train, then test, then val)
for _mfcc_df, _mfcc_split in ((train_df, 'train'), (test_df, 'test'), (val_df, 'val')):
    process_mfccs(_mfcc_df, _mfcc_split)

Generating MFCCs for train:   0%|          | 0/5208 [00:00<?, ?it/s]

Generating MFCCs for test:   0%|          | 0/1489 [00:00<?, ?it/s]

Generating MFCCs for val:   0%|          | 0/745 [00:00<?, ?it/s]

## Processing complete
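The data has been split into train, test and validation sets; mel spectrograms have been generated, embeddings extracted with ResNet18 and VGG16, and MFCC features computed. All outputs are written under `DATA_ROOT`, in the `splits`, `Spectrograms`, `Embeddings` and `MFCCs` folders.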

Adjust the `DATA_ROOT` variable at the top if your data is stored elsewhere.