In [1]:
import os

# Expose only GPU 0 to CUDA. This is set here, before the cell that imports
# TensorFlow, so the restriction is already in place when the framework loads.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [2]:
import gc
import pickle
import sys

import librosa
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from audiomentations import AddGaussianNoise, Compose, PitchShift, Shift, TimeStretch
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

2024-05-05 17:30:41.728707: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-05 17:30:41.755260: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 17:30:41.755290: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 17:30:41.755296: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-05 17:30:41.759596: I tensorflow/core/platform/cpu_feature_g

In [3]:
# Zero-based emotion index -> emotion name.
emotion_list = {
    0: "neutral",
    1: "calm",
    2: "happy",
    3: "sad",
    4: "angry",
    5: "fearful",
    6: "disgusted",
    7: "surprised",
}
RAV = "data/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
SPLIT_SEED = 42  # fixed seed so the train/test split is the same on every run

# Collect every file found in each sub-directory of RAV. Entries are sorted
# because os.listdir returns them in arbitrary order, which would otherwise make
# even a seeded split differ between runs or machines.
dir_list = sorted(os.listdir(RAV))
paths = []
for sub_dir in dir_list:
    sub_dir_path = os.path.join(RAV, sub_dir)
    if not os.path.isdir(sub_dir_path):
        # Skip stray files sitting next to the sub-directories; calling
        # os.listdir on them would raise NotADirectoryError.
        continue
    for file_name in sorted(os.listdir(sub_dir_path)):
        paths.append(os.path.join(sub_dir_path, file_name))

# 80/20 split of the file paths; random_state pins the shuffle.
path_train, path_test = train_test_split(
    paths, test_size=0.2, random_state=SPLIT_SEED
)

In [4]:
# Waveform augmentation pipeline. The transforms are listed in the order
# noise -> time stretch -> pitch shift -> shift, each configured with p=0.5.
augment = Compose(
    transforms=[
        AddGaussianNoise(p=0.5, min_amplitude=0.001, max_amplitude=0.002),
        TimeStretch(p=0.5, min_rate=0.8, max_rate=1.25),
        PitchShift(p=0.5, min_semitones=-4, max_semitones=4),
        Shift(p=0.5),
    ]
)

In [5]:
def get_label(path, emotion_list):
    """Return the zero-based emotion label encoded in an audio file's name.

    The file name (directory part removed) is cut at its first ".", the
    remaining text is split on "-", and the third field is parsed as an
    integer and reduced by one.

    Args:
        path: Path to the audio file.
        emotion_list: Accepted for interface compatibility; not used here.

    Returns:
        int: The third dash-separated field of the file name, minus one.

    Raises:
        IndexError: If the name has fewer than three dash-separated fields.
        ValueError: If the third field is not an integer.
    """
    stem, _, _ = os.path.basename(path).partition(".")
    emotion_code = stem.split("-")[2]
    return int(emotion_code) - 1


def process_test_audios(paths):
    """Load every audio file in ``paths`` without augmentation.

    Args:
        paths: Iterable of audio file paths.

    Returns:
        tuple: ``(features, labels)`` - two lists in the same order as
        ``paths``. ``features`` holds the waveform returned by
        ``librosa.load`` for each file and ``labels`` the matching value
        from ``get_label``.
    """
    loaded = []
    for audio_path in paths:
        # librosa.load also returns the sample rate, which is not needed here.
        waveform, _ = librosa.load(audio_path)
        loaded.append((waveform, get_label(audio_path, emotion_list)))

    features = [waveform for waveform, _ in loaded]
    labels = [label for _, label in loaded]
    return features, labels


def audio_generator(file_paths, batch_size):
    """Yield an endless stream of augmented training batches.

    Each batch is built from ``batch_size`` paths drawn with
    ``np.random.choice`` (its default samples with replacement, so a path
    can occur more than once in a batch). Every drawn file is loaded with
    ``librosa.load`` and passed through the module-level ``augment``
    pipeline.

    Args:
        file_paths: Sequence of audio file paths to draw from.
        batch_size: Number of paths drawn per batch.

    Yields:
        tuple: ``(batch_features, batch_labels)`` - a list of augmented
        waveforms and a list of the matching ``get_label`` values.
    """

    def _augmented_example(audio_path):
        # Load -> augment -> label, one file at a time.
        waveform, rate = librosa.load(audio_path)
        augmented = augment(samples=waveform, sample_rate=rate)
        return augmented, get_label(audio_path, emotion_list)

    while True:
        drawn_paths = np.random.choice(a=file_paths, size=batch_size)
        examples = [_augmented_example(audio_path) for audio_path in drawn_paths]
        yield (
            [features for features, _ in examples],
            [label for _, label in examples],
        )


def extract_mfcc(audio, sr=22050, n_mfcc=40):
    """Summarise a waveform as the per-coefficient mean of its MFCCs.

    Args:
        audio: Waveform to analyse, passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sample rate of ``audio`` in Hz. The default of 22050 is the value
            this function previously hard-coded; pass the real rate if the
            audio was loaded at a different one.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 40, the
            value this function previously hard-coded.

    Returns:
        numpy.ndarray: The MFCC matrix transposed and averaged over axis 0,
        i.e. one mean value per coefficient.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    # Transpose, then reduce over axis 0, exactly as before, so the result
    # stays numerically identical for the default arguments.
    return np.mean(mfcc.T, axis=0)


In [6]:
# Test set: the held-out files, loaded without augmentation.
X_test, Y_test = process_test_audios(path_test)

# Training set: draw a fixed number of augmented batches from the generator.
batch_size = 24
NUM_TRAIN_BATCHES = 500  # 500 batches * 24 clips = 12000 augmented clips
gen = audio_generator(file_paths=path_train, batch_size=batch_size)
X_train, Y_train = [], []
for _ in range(NUM_TRAIN_BATCHES):
    batch_data, batch_labels = next(gen)
    X_train.extend(batch_data)
    Y_train.extend(batch_labels)

# Pin the one-hot width to the full label set. Without num_classes,
# to_categorical infers the width from the largest label present, so the two
# arrays could end up with different widths if the highest class happened to
# be absent from one of the splits.
num_classes = len(emotion_list)
Y_train = to_categorical(Y_train, num_classes=num_classes)
Y_test = to_categorical(Y_test, num_classes=num_classes)
len(X_train), len(Y_train)

(12000, 12000)

In [7]:
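# Optional checkpoint: uncomment to save the raw waveforms and one-hot labels to
# ./data/raw_data.pickle so the loading/augmentation cell need not be re-run.
# data = {"X_train_raw": X_train, "X_test_raw": X_test, "Y_train": Y_train, "Y_test": Y_test}
# with open("./data/raw_data.pickle", "wb") as f:
#     pickle.dump(data, f)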

In [8]:
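# Optional: uncomment to reload the raw waveforms and labels from
# ./data/raw_data.pickle instead of regenerating them. Only unpickle files you
# created yourself; pickle.load can execute arbitrary code.
# with open("./data/raw_data.pickle", "rb") as f:
#     data = pickle.load(f)
# X_train = data["X_train_raw"]
# X_test = data["X_test_raw"]
# Y_train = data["Y_train"]
# Y_test = data["Y_test"]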

In [9]:
# Replace each raw waveform with its MFCC summary vector, stack the vectors
# into one array per split, and append a trailing singleton axis.
X_train, X_test = (
    np.array([extract_mfcc(clip) for clip in clips])[..., np.newaxis]
    for clips in (X_train, X_test)
)

In [10]:
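# Optional checkpoint: uncomment to save the MFCC feature arrays and labels to
# ./data/processed_data.pickle so feature extraction need not be re-run.
# data = {"X_train": X_train, "X_test": X_test, "Y_train": Y_train, "Y_test": Y_test}
# with open("./data/processed_data.pickle", "wb") as f:
#     pickle.dump(data, f)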

In [11]:
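# Optional: uncomment to reload the MFCC feature arrays and labels from
# ./data/processed_data.pickle instead of recomputing them. Only unpickle files
# you created yourself; pickle.load can execute arbitrary code.
# with open("./data/processed_data.pickle", "rb") as f:
#     data = pickle.load(f)
# X_train = data["X_train"]
# X_test = data["X_test"]
# Y_train = data["Y_train"]
# Y_test = data["Y_test"]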

In [12]:
# Sanity check on the training array layout: one row per clip, one entry per
# MFCC coefficient, and a trailing singleton axis.
X_train.shape

(12000, 40, 1)