# Emotion Recognition from Speech

Using neural networks, audio data from actors were used to train a model to associate emotions with speech.

<hr></hr>

## Module, Functions, and Definitions

In [30]:
import librosa  #  Analyze audio files
import soundfile  #  Read sound files
import os, glob, joblib
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier  #  Multi-Layer Perceptron classifier (Artificial Neural Network)
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import GridSearchCV

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [3]:
# Function used to cycle through audio files and extract features to use to train model

def extract_feature(file_name):
    with soundfile.SoundFile(file_name) as sf:
        X = sf.read(dtype="float32")
        sample_rate = sf.samplerate
        
        result = np.array([])
        
# Mel-frequency cepstral coefficients - More accuratly represents human speech
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

# Chromagram from a waveform - Returns normalized energy values for 12 distinct semitones
        stft = np.abs(librosa.stft(X))
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
          
# Mel-scaled spectrogram - Frequency histogram based on percieved frequencies
        mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        
    return np.hstack((mfccs, chroma, mel))

In [4]:
# Dictionary of emotions from dataset

emotions = {
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

## Data Pre-processing

The audio files were obtained from the Ryerson Audio-Visiual Database of Emotional Speech and Song (RAVDESS), which consists of 24 actors saying the same short phrase in 8 different emotions.

Data was acquired here:  https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip?download=1

In [5]:
# Read through all audio files, extract features, and split into testing and training groups

X,y = [], []

for file in glob.glob("data/Actor_*/*.wav"):
    name = os.path.basename(file)
# Emotions are indicated with a number in the file name corresponding to that defined in our dictionary.
    emotion = emotions[name.split("-")[2]]
    feature = extract_feature(file)
    
    X.append(feature)
    y.append(emotion)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, stratify=y, random_state=42)

print(f'Number of training data: {X_train.shape[0]}\nNumber of testing data: {X_test.shape[0]}')

Number of training data: 1076
Number of testing data: 359


In [9]:
# Normalize testing and training audio feature data

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Classify emotions with number representation

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [11]:
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [13]:
print(f'Shape of data: {X_train_scaled[0].shape[0], y_train_categorical[0].shape[0]}')

Shape of data: (180, 8)


## Model Training

Two methods were used to train a model to fit the data: a TensorFlow neural network; and a Scikit-Learn neural network.

In [14]:
model = Sequential()
model.add(Dense(units=300, activation='relu', input_dim=180))
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=300, activation='relu'))
model.add(Dense(units=8, activation='softmax'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 300)               54300     
_________________________________________________________________
dense_1 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_2 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_3 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_4 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_5 (Dense)              (None, 8)                 2408      
Total params: 417,908
Trainable params: 417,908
Non-trainable params: 0
__________________________________________________

In [17]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [18]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=300,
    shuffle=True,
    verbose=1
)

Train on 1076 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300

Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
E

Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 

Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 

Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


<tensorflow.python.keras.callbacks.History at 0x162ac325f48>

In [33]:
# Test accuracy of model
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)

print(f"TensorFlow Results - Loss: {model_loss}, Accuracy: {model_accuracy}")

359/359 - 0s - loss: 3.1276 - accuracy: 0.6908
TensorFlow Results - Loss: 3.1276437306470526, Accuracy: 0.6908078193664551


In [34]:
# Save model
model.save("tf_model.h5")

In [20]:
mlpModel = MLPClassifier(
    alpha = 0.01,
    batch_size = 256,
    epsilon = 1e-08,
    hidden_layer_sizes = (300,),
    learning_rate = 'adaptive',
    max_iter = 1000,
    #shuffle = False,
    #n_iter_no_change = 20,
    #learning_rate_init = 0.0001,
    #beta_1 = 0.1,
    #beta_2 = 0.9,
    warm_start = True
)

In [21]:
mlpModel.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=1000, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=True)

In [29]:
# Test accuracy of model
y_pred = mlpModel.predict(X_test)

print(f'MLP Results - Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred)}')

MLP Results - Accuracy: 0.4986072423398329


In [32]:
# Save model
joblib.dump(mlpModel, "mlp_model.sav")

['mlp_model.sav']