In [None]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import glob
import matplotlib.pyplot as plt
from tensorflow import keras

In [None]:
# glob returns paths in filesystem-dependent order. Sorting makes the order
# (and therefore any positional train/test split done later) reproducible.
audio_files = sorted(glob.glob("./archive/*/*"))

# Show a count and a short sample instead of dumping every path.
print(f"Found {len(audio_files)} files")
print(audio_files[:5])

Each of the 1440 files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

Filename identifiers

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).

Vocal channel (01 = speech, 02 = song).

Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.

Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").

Repetition (01 = 1st repetition, 02 = 2nd repetition).

Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

Filename example: 03-01-06-01-02-01-12.wav

Audio-only (03)
Speech (01)
Fearful (06)
Normal intensity (01)
Statement "dogs" (02)
1st Repetition (01)
12th Actor (12)
Female, as the actor ID number is even.

To build the target label for each audio file, we only need the emotion code, i.e. the third identifier in its filename.

In [None]:
# Emotion names indexed by (emotion code - 1), following the filename convention.
emotions = ["neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

# Keep only the .wav files. Slice assignment filters the existing list object
# in a single pass instead of calling list.remove() once per rejected entry.
audio_files[:] = [path for path in audio_files if path.lower().endswith(".wav")]

# The emotion code is the 3rd of the 7 dash-separated fields, i.e. the 5th
# field counting from the end. Subtract 1 so the label indexes `emotions`.
Y = np.array([int(path.split("-")[-5]) - 1 for path in audio_files], dtype=np.int32)

# Fail early if a filename does not follow the expected naming scheme.
assert ((Y >= 0) & (Y < len(emotions))).all(), "emotion code outside 01-08"

print(len(audio_files))
print(Y)

# Preparing the input data

In [None]:
def prepare_data(audio_path, sr=22050, duration=3.5, n_mels=256):
    """Load one audio clip and return its mel spectrogram in decibels.

    The clip is loaded with at most ``duration`` seconds of audio and then
    forced to exactly ``int(sr * duration)`` samples (padded or trimmed), so
    every clip produces a spectrogram with the same number of frames.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load.
    sr : int, default 22050
        Sampling rate the audio is resampled to on load.
    duration : float, default 3.5
        Clip length in seconds used for both loading and padding.
    n_mels : int, default 256
        Number of mel bands in the spectrogram.

    Returns
    -------
    numpy.ndarray
        The dB-scaled mel spectrogram cast to ``float32``.
    """
    target_length = int(sr * duration)

    # Only the samples are needed; the returned rate equals the requested `sr`.
    samples, _ = librosa.load(path=audio_path, sr=sr, duration=duration)

    # fix_length both pads and trims, so it can be applied unconditionally to
    # guarantee a fixed length before the spectrogram is computed.
    samples = librosa.util.fix_length(samples, size=target_length)

    mel_spec = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec)

    # Only a dtype cast happens here; no value normalisation is applied.
    return mel_spec_db.astype(np.float32)

In [None]:
def prepare_data_set(audio_files):
    """Build the feature array for a collection of audio files.

    Each path is passed to ``prepare_data`` and the per-file results are
    stacked along a new leading axis, one entry per file in input order.
    """
    features = [prepare_data(path) for path in audio_files]
    return np.stack(features)

X = prepare_data_set(audio_files)

# Min-max scale the whole array into [0, 1] using its global extremes.
# NOTE(review): the extremes are taken over every clip in X, not per clip.
x_min, x_max = X.min(), X.max()
X = (X - x_min) / (x_max - x_min)

# Viewing some examples

In [None]:
# Position of the clip used as the running example in the cells below.
index = 90

# `m` is the sampling rate returned by librosa (equal to the requested 22050).
audio_file, m = librosa.load(audio_files[index], sr=22050)
print(len(audio_file))

# Draw on the explicit Axes and label the figure so it can be read on its own.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(audio_file)
ax.set_title(emotions[Y[index]] + " waveform")
ax.set_xlabel("Sample index")
ax.set_ylabel("Amplitude")
plt.show()

# Viewing spectrograms and melspectrograms

In [None]:
 # We apply short time fourier transform
audio_file_transformed = librosa.stft(audio_file)
# Magnitude of the complex STFT, then converted to a decibel scale
spectogram = np.absolute(audio_file_transformed)
spectogram_db = librosa.amplitude_to_db(spectogram)
# Plot with time on x and a logarithmic frequency axis on y
fig, ax = plt.subplots(figsize=(10, 8))
sp = librosa.display.specshow(
    spectogram_db,
    ax=ax,
    x_axis="time",
    y_axis="log",
)
ax.set_title(emotions[Y[index]])

In [None]:
# Mel spectrogram of the example clip, exactly as it is fed to the model.
my_example = prepare_data(audio_files[index])
print(my_example.shape)

fig, ax = plt.subplots(figsize=(10, 8))
# The rows are mel bands, so the frequency axis must use the mel scale;
# a "log" axis would label them as if they were linear FFT bins.
sp = librosa.display.specshow(my_example, sr=22050, x_axis="time", y_axis="mel", ax=ax)
fig.colorbar(sp, ax=ax, format="%+2.0f dB")
ax.set_title(emotions[Y[index]] + " mel_spec")

# Creating Model

In [None]:
# CNN classifier over spectrograms treated as single-channel images.
model = keras.Sequential()

# Append a trailing channel axis of size 1 to the per-sample shape of X.
model.add(keras.layers.Input(X.shape[1:] + (1,)))

# Four feature-extraction stages that differ only in their filter count:
# convolution -> average pooling -> batch normalisation -> dropout.
for n_filters in (32, 64, 128, 256):
    model.add(keras.layers.Conv2D(n_filters, (3, 3), activation="relu"))
    model.add(keras.layers.AveragePooling2D(pool_size=(2, 2)))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dropout(0.2))

# Classification head: flatten, one hidden layer, then 8-way softmax
# (one output per emotion class).
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(512, activation="relu"))
model.add(keras.layers.Dense(8, activation="softmax"))
model.summary()

In [None]:
# Positional 80/20 split: the first 80 % of the samples are used for training
# and the remainder is held out for testing. No shuffling is performed.
train_size = int(0.8 * len(X))

X_train, X_test = X[:train_size], X[train_size:]
Y_train, Y_test = Y[:train_size], Y[train_size:]

In [None]:
# Labels are integer class ids and the model ends in a softmax, hence the
# sparse categorical loss on probabilities (from_logits=False).
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# The metric above is logged as "sparse_categorical_accuracy", so its
# validation counterpart is "val_sparse_categorical_accuracy"; there is no
# "val_accuracy" entry to monitor. save_best_only=True is what makes
# `monitor`/`mode` take effect: only the best-scoring epoch is kept on disk.
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="model.keras",
    monitor="val_sparse_categorical_accuracy",
    mode="max",
    save_best_only=True,
)

In [None]:
# Sanity check: the largest label must be a valid index into `emotions`.
Y.max()

In [None]:
# `callbacks` is documented as a list of callback instances, so wrap the
# single checkpoint callback in a list. validation_split=0.2 sets aside part
# of the training arrays for per-epoch validation.
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=100,
    callbacks=[checkpoint],
    validation_split=0.2,
)

# evaluate() returns [loss, metric] in the order given to compile().
test_scores = model.evaluate(X_test, Y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

In [None]:
# Metrics on the training arrays, kept under their own name so they are not
# mistaken for (and do not overwrite) the held-out test scores.
train_scores = model.evaluate(X_train, Y_train, verbose=2)
print("Train loss:", train_scores[0])
print("Train accuracy:", train_scores[1])

In [None]:
# Held-out metrics, returned as [loss, sparse categorical accuracy].
test_scores = model.evaluate(
    x=X_test,
    y=Y_test,
    verbose=2,
)
print(test_scores)