In [None]:
import pandas as pd
import numpy as np
import pathlib
from tqdm.notebook import tqdm
import librosa
import awkward as ak
import matplotlib.pyplot as plt

# IMPORT

## LOCAL IMPORT

In [None]:
dataset_folder = pathlib.Path("../../ravdess/ravdess/")  # folder scanned for the .wav files below; change it to your local dataset location

In [None]:
# Collect every .wav file of the dataset folder, sorted by path.
# The filter builds a new list instead of popping from the list being iterated:
# removing items while enumerating shifts the remaining elements, so the entry
# right after each removed one is never inspected and non-.wav files can survive.
file_list = sorted(
    path for path in dataset_folder.iterdir() if path.suffix == ".wav"
)
file_list

In [None]:
# Load every recording as a one-channel entry: array[i] == [samples_of_file_i].
# sr=None is forwarded to librosa.load for each file.
# NOTE(review): `array` is rebuilt from scratch by the ArrayBuilder cell that follows.
array = []
for wav_path in tqdm(file_list):
    samples, sr = librosa.load(wav_path, sr=None)
    array.append([samples])

  0%|          | 0/2452 [00:00<?, ?it/s]

In [None]:
# Build a single awkward array nested as (file, 1 inner list, variable number of samples).
builder = ak.ArrayBuilder()
for wav_path in tqdm(file_list):
    samples, sr = librosa.load(wav_path, sr=None)
    builder.begin_list()  # one entry per file
    builder.begin_list()  # the single inner list that holds the samples
    # NOTE(review): samples are appended one at a time in a Python loop.
    for sample in samples:
        builder.real(sample)
    builder.end_list()
    builder.end_list()
array = builder.snapshot()

  0%|          | 0/2452 [00:00<?, ?it/s]

Filename identifiers 

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
Vocal channel (01 = speech, 02 = song).
Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
Repetition (01 = 1st repetition, 02 = 2nd repetition).
Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [None]:
# Maps each filename field (in the order the fields appear in a RAVDESS filename)
# to a {two-digit code: human-readable label} dictionary.
CATEGORICAL_FEATURES_NAMES = {
    "modality": {
        "01": "full-AV",
        "02": "video-only",
        "03": "audio-only",
    },
    "vocal_channel": {
        "01": "speech",
        "02": "song",
    },
    "emotion": {
        "01": "neutral",
        "02": "calm",
        "03": "happy",
        "04": "sad",
        "05": "angry",
        "06": "fearful",
        "07": "disgust",
        "08": "surprised",
    },
    "emotional_intensity": {
        "01": "normal",
        "02": "strong",
    },
    "statement": {
        "01": "Kids are talking by the door",
        "02": "Dogs are sitting by the door",
    },
    "repetition": {
        "01": "1st",
        "02": "2nd",
    },
    # Actors 01..24 map to themselves (the code is kept as the label).
    "actor": {f"{actor_id:02d}": f"{actor_id:02d}" for actor_id in range(1, 25)},
}

In [None]:
# Decode the dash-separated filename fields into one labelled row per recording.
file_names = [path.name for path in file_list]
df = pd.DataFrame(
    [path.stem.split("-") for path in file_list],
    columns=list(CATEGORICAL_FEATURES_NAMES.keys()),
)

# A nested {column: {code: label}} dict already makes DataFrame.replace work
# column by column, so one call is enough (the previous per-column loop repeated
# the same whole-frame replacement once for every column).
df = df.replace(CATEGORICAL_FEATURES_NAMES)

# Even actor ids are labelled "F", odd ones "M".
df["sex"] = ["F" if i % 2 == 0 else "M" for i in df["actor"].astype(int)]
df["filename"] = file_names

In [None]:
df.head()

Unnamed: 0,modality,vocal_channel,emotion,emotional_intensity,statement,repetition,actor,sex,filename
0,audio-only,speech,neutral,normal,Kids are talking by the door,1st,1,M,03-01-01-01-01-01-01.wav
1,audio-only,speech,neutral,normal,Kids are talking by the door,1st,2,F,03-01-01-01-01-01-02.wav
2,audio-only,speech,neutral,normal,Kids are talking by the door,1st,3,M,03-01-01-01-01-01-03.wav
3,audio-only,speech,neutral,normal,Kids are talking by the door,1st,4,F,03-01-01-01-01-01-04.wav
4,audio-only,speech,neutral,normal,Kids are talking by the door,1st,5,M,03-01-01-01-01-01-05.wav


In [None]:
# Order the rows by actor first, then by filename, and keep the resulting row labels.
df_sorted_actor = df.sort_values(["actor", "filename"])
idxs_sorted = df_sorted_actor.index.tolist()

In [None]:
# Split by actor: ids below 19 go to the training set, the remaining ones to the test set.
is_train_actor = df_sorted_actor["actor"].astype(int) < 19
idx_train = df_sorted_actor.index[is_train_actor].tolist()
idx_test = df_sorted_actor.index[~is_train_actor].tolist()

In [None]:
# Select the series and their label rows for each split using the index lists,
# so that row i of Y_* describes series i of X_*.
X_train, X_test = array[idx_train], array[idx_test]
Y_train = df_sorted_actor.loc[idx_train]
Y_test = df_sorted_actor.loc[idx_test]

## ONLINE IMPORT (very early stage, expect bugs!)
This will cache the dataset in your conda environment.
You can find the package location with:
```
import datatime
print(datatime.__file__)
```

In [None]:
# !pip install git+https://github.com/fspinna/datatime.git

In [None]:
from datatime import load_dataset

In [None]:
%%time
# Load the "RavdessAudioOnly" dataset through datatime's load_dataset.
# Printing the returned object shows a summary (dataset name, task, split shapes),
# as seen in the cell output.
d = load_dataset("RavdessAudioOnly")
print(d)

Dataset Name: RavdessAudioOnly
Task: multioutput
X_train: (1828, 1, (140941, 142542, 144144, 145745, 145746, 147347, 147348, 148948, 148949, 150550, 150551, 152152, 153753, 153754, 155355, 155356, 156956, 156957, 158558, 158559, 160160, 161761, 161762, 163363, 163364, 164964, 164965, 166566, 166567, 168168, 169769, 169770, 171371, 171372, 172972, 172973, 174574, 174575, 176176, 177777, 177778, 179379, 179380, 180980, 180981, 182582, 182583, 184184, 185785, 185786, 187387, 187388, 188988, 188989, 190590, 190591, 192192, 193793, 193794, 195395, 195396, 196996, 196997, 198598, 198599, 200200, 201801, 201802, 203403, 203404, 205004, 205005, 206606, 206607, 208208, 209809, 209810, 211411, 211412, 213012, 213013, 214614, 214615, 216216, 217817, 217818, 219419, 219420, 221020, 221021, 222622, 222623, 224224, 225825, 225826, 227427, 227428, 229028, 229029, 230630, 230631, 232232, 233833, 233834, 235435, 235436, 237036, 237037, 238638, 238639, 240240, 241841, 241842, 243443, 243444, 245045, 246

In [None]:
# Calling the dataset object yields four values, unpacked here as the train/test
# series and their label tables (note the order: X_train, Y_train, X_test, Y_test).
X_train, Y_train, X_test, Y_test = d()

In [None]:
Y_train

Unnamed: 0,modality,vocal_channel,emotion,emotional_intensity,statement,repetition,actor,sex,filename
0,audio-only,speech,neutral,normal,Kids are talking by the door,1st,1,M,03-01-01-01-01-01-01.wav
1,audio-only,speech,neutral,normal,Kids are talking by the door,2nd,1,M,03-01-01-01-01-02-01.wav
2,audio-only,speech,neutral,normal,Dogs are sitting by the door,1st,1,M,03-01-01-01-02-01-01.wav
3,audio-only,speech,neutral,normal,Dogs are sitting by the door,2nd,1,M,03-01-01-01-02-02-01.wav
4,audio-only,speech,calm,normal,Kids are talking by the door,1st,1,M,03-01-02-01-01-01-01.wav
...,...,...,...,...,...,...,...,...,...
1823,audio-only,speech,surprised,normal,Dogs are sitting by the door,2nd,18,F,03-01-08-01-02-02-18.wav
1824,audio-only,speech,surprised,strong,Kids are talking by the door,1st,18,F,03-01-08-02-01-01-18.wav
1825,audio-only,speech,surprised,strong,Kids are talking by the door,2nd,18,F,03-01-08-02-01-02-18.wav
1826,audio-only,speech,surprised,strong,Dogs are sitting by the door,1st,18,F,03-01-08-02-02-01-18.wav


# TO NUMPY

In [None]:
def pad_X(X, m_max, nan_value=0):
    """Bring every innermost list of ``X`` (axis 2) to exactly ``m_max`` items.

    Parameters
    ----------
    X : awkward array
        Nested array; the lists found at axis 2 are the ones resized.
    m_max : int
        Target length. Shorter lists are padded, longer ones are cut
        (``clip=True``).
    nan_value : scalar, default 0
        Value written into the padded (missing) positions.

    Returns
    -------
    The padded array with the missing positions replaced by ``nan_value``.
    """
    padded = ak.pad_none(X, m_max, axis=2, clip=True)
    return ak.fill_none(padded, value=nan_value)

In [None]:
# find the max length of X_train
# Scan the training series and keep the largest flattened length seen.
maximum = 0
for ts in X_train:
    length = len(np.asarray(np.ravel(ts)))
    maximum = max(maximum, length)
maximum

304304

In [None]:
%%time
# Pad (or clip) both splits to the longest training length, filling with NaN,
# and convert them to dense NumPy arrays.
# Only axis 1 (the single inner list per series) is squeezed: a bare np.squeeze
# would also drop the sample axis whenever a split contains exactly one series.
X_train = np.squeeze(np.array(pad_X(X_train, maximum, np.nan)), axis=1)
X_test = np.squeeze(np.array(pad_X(X_test, maximum, np.nan)), axis=1)

CPU times: user 7.91 s, sys: 12.9 s, total: 20.8 s
Wall time: 24.3 s


In [None]:
# Sanity check: one shape per line for the series arrays and the label tables.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep="\n")

(1828, 304304)
(624, 304304)
(1828, 9)
(624, 9)


In [None]:
# Persist the padded series arrays next to the notebook as .npy files.
np.save(file="RavdessAudioOnlyNumpy__X_train.npy", arr=X_train)
np.save(file="RavdessAudioOnlyNumpy__X_test.npy", arr=X_test)

In [None]:
# Persist the label tables as CSV; the row index is not written.
Y_train.to_csv(path_or_buf="RavdessAudioOnlyNumpy__Y_train.csv", index=False)
Y_test.to_csv(path_or_buf="RavdessAudioOnlyNumpy__Y_test.csv", index=False)

# FROM NUMPY

In [None]:
# Reload the saved series arrays and show both shapes on one line.
X_train = np.load(file="RavdessAudioOnlyNumpy__X_train.npy")
X_test = np.load(file="RavdessAudioOnlyNumpy__X_test.npy")
print(*(split.shape for split in (X_train, X_test)))

(1828, 304304) (624, 304304)


In [None]:
X_train

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
                    nan,             nan,             nan],
       [ 0.00000000e+00,  0.00000000e+00, -3.05175781e-05, ...,
                    nan,             nan,             nan],
       [ 3.05175781e-05,  3.05175781e-05,  0.00000000e+00, ...,
                    nan,             nan,             nan],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
                    nan,             nan,             nan],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
                    nan,             nan,             nan],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
                    nan,             nan,             nan]])