In [None]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import glob
import matplotlib.pyplot as plt

In [None]:
# Collect every path matching <dataset root>/*/*.
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# sample order (and anything that later depends on position) reproducible.
audio_files = sorted(glob.glob("./ravdess-emotional-speech-audio/versions/1/*/*"))
# Show a short summary instead of dumping the complete path list.
print(len(audio_files), "files found")
print(audio_files[:5])

Each of the 1440 files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

Filename identifiers

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).

Vocal channel (01 = speech, 02 = song).

Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.

Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").

Repetition (01 = 1st repetition, 02 = 2nd repetition).

Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

Filename example: 03-01-06-01-02-01-12.wav

Audio-only (03)
Speech (01)
Fearful (06)
Normal intensity (01)
Statement "dogs" (02)
1st Repetition (01)
12th Actor (12)
Female, as the actor ID number is even.

The target output for each recording is its emotion, so for every audio file we extract only the emotion identifier (the third field of the filename) and use it as the label.

In [None]:
# Index i holds the name of emotion code i; index 0 is a placeholder because the codes start at 1.
emotions = ["", "neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

# Keep only the wav entries, updating the existing list object in place.
audio_files[:] = [path for path in audio_files if path.endswith("wav")]

# Splitting the path on "-" and counting from the end puts the emotion code at position -5.
Y = np.array([int(path.split("-")[-5]) for path in audio_files])

print(len(audio_files))
print(Y)

Preparing the input data

In [None]:
def prepare_data(audio_files, sr=22050, duration=3, top_db=10):
    """Turn audio files into a stacked array of fixed-size dB spectrograms.

    Each file is loaded at ``sr``, trimmed at both ends with
    ``librosa.effects.trim``, padded or cut to exactly ``duration`` seconds,
    transformed with a short-time Fourier transform, and converted from
    magnitude to decibels.

    Parameters
    ----------
    audio_files : iterable of str
        Paths of the audio files to process.
    sr : int, optional
        Sample rate passed to ``librosa.load`` (default 22050).
    duration : float, optional
        Length, in seconds, every trimmed signal is fixed to (default 3).
    top_db : float, optional
        Threshold passed to ``librosa.effects.trim`` (default 10).

    Returns
    -------
    numpy.ndarray
        One spectrogram per input file, stacked along a new first axis.

    Raises
    ------
    ValueError
        If ``audio_files`` is empty.
    """
    paths = list(audio_files)
    if not paths:
        raise ValueError("prepare_data() needs at least one audio file")

    # Every signal is forced to this many samples so all spectrograms share one shape.
    n_samples = int(round(duration * sr))

    spectrograms = []
    for path in paths:
        signal, _ = librosa.load(path, sr=sr)
        # Remove the quiet sections at the beginning and end of the recording.
        trimmed, _ = librosa.effects.trim(signal, top_db=top_db)
        # Pad or cut to the fixed length before any further processing.
        fixed = librosa.util.fix_length(trimmed, size=n_samples)

        # Short-time Fourier transform -> magnitude spectrogram -> decibel scale.
        magnitude = np.abs(librosa.stft(fixed))
        spectrograms.append(librosa.amplitude_to_db(magnitude))

    return np.stack(spectrograms)

# Build the stacked spectrogram array for every path in audio_files.
X = prepare_data(audio_files)

In [None]:
# Preview the values scaled by the global maximum (X itself is left unchanged),
# followed by the array's shape.
print(X / X.max())
print(X.shape)

In [None]:
# Show the first spectrogram on a log-frequency axis. The title and the dB colour
# bar let the figure be read on its own.
fig, ax = plt.subplots(figsize=(10, 8))
spec_image = librosa.display.specshow(X[0], x_axis="time", y_axis="log", ax=ax)
ax.set_title("Spectrogram of the first recording (dB)")
fig.colorbar(spec_image, ax=ax, format="%+2.0f dB");

In [None]:
# Collapse each sample's spectrogram into a single feature vector, reporting the
# shape and element count before and after.
print(X.shape, X.size)
X = X.reshape(len(X), -1)
print(X.shape, X.size)

In [None]:
from tensorflow.keras import Input, layers, Model

In [None]:
# Per-sample feature shape: every axis of X after the first (sample) axis.
print(X.shape[1:])

In [None]:
# Fully connected classifier: two ReLU hidden layers followed by an 8-way softmax.
# The input tensor is named `inputs` so the builtin `input()` is not shadowed.
inputs = Input(shape=X.shape[1:])
layer1 = layers.Dense(32, activation="relu")(inputs)
layer2 = layers.Dense(64, activation="relu")(layer1)
output = layers.Dense(8, activation="softmax")(layer2)

model = Model(inputs=inputs, outputs=output)
model.summary()

# Y is overwritten below, so running this cell twice would shift the labels twice
# and silently produce a -1 class. Fail loudly before that can happen.
if Y.min() < 1 or Y.max() > 8:
    raise ValueError("Y must hold emotion codes 1-8; re-run the label cell before this one")

# Shift the 1-based emotion codes to 0-based class indices.
Y = Y-1
# Scale the features by the global maximum of X.
X = X / np.max(X)

In [None]:
# Positional 80/20 split: the leading 80% of the samples are used for training,
# the remainder is held out for testing. No shuffling is applied.
train_size = int(0.8*len(X))

X_train, X_test = X[:train_size], X[train_size:]
Y_train, Y_test = Y[:train_size], Y[train_size:]

In [None]:
from tensorflow import keras

# Configure training with integer class labels (sparse categorical loss and accuracy).
model.compile(
    # The model's last layer already applies softmax, so the loss receives
    # probabilities rather than raw logits; from_logits must therefore be False.
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=keras.optimizers.RMSprop(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# Train, holding out 20% of the training samples for validation.
history = model.fit(X_train, Y_train, batch_size=64, epochs=50, validation_split=0.2)

# Evaluate on the held-out test set; evaluate() returns [loss, accuracy] here.
test_scores = model.evaluate(X_test, Y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])