<a href="https://colab.research.google.com/github/karimhatem12/Speech-Emotion-Recognition-/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Speech Emotion Recognition (Classification) in real-time using Deep LSTM layers**
### ***A Deep Learning LSTM-based model with Keras.***
---

### Final project (B.Sc. requirement)  
Developed by **Karim hatem hamed.**

Instructor: **Dr. Eslam Elshaarawy**

Computer Science.

MSA University, Egypt.





# **LIBRARIES & GOOGLE AUTH**

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive so the dataset and the saved
# artefacts referenced by the later cells are reachable from this runtime.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
 %%capture
!pip install pydub
!pip install pywt
!pip install noisereduce
!pip install json-tricks

In [None]:
%%capture
import numpy as np
import pandas as pd

import os
from json_tricks import dump, load

from pydub import AudioSegment, effects
import librosa
import noisereduce as nr
import pywt

import tensorflow as tf
import keras
import sklearn

In [None]:
# Emotion kind validation function for TESS database, due to emotions written within the file names.
def find_emotion_T(name):
    """Return the two-digit emotion code encoded as text in a file name.

    The file name is searched for known emotion markers (lower-case words such
    as 'happy' and upper-case abbreviations such as 'HAP'). The markers are
    tried in a fixed order and the first one contained in `name` decides the
    result.

    Parameters:
        name: file name (string) to inspect.

    Returns:
        One of "01", "03", "04", "05", "06", "07", "08", or "-1" when no
        marker is found in `name`.
    """
    # Order matters: it reproduces the precedence of an if/elif chain.
    marker_codes = (
        ('neutral', "01"), ('NEU', "01"),
        ('happy', "03"), ('HAP', "03"),
        ('sad', "04"), ('SAD', "04"),
        ('angry', "05"), ('ANG', "05"),
        ('fear', "06"), ('FEA', "06"),
        ('disgust', "07"), ('DIS', "07"),
        ('ps', "08"),
    )
    for marker, code in marker_codes:
        if marker in name:
            return code
    return "-1"
        
 
# 'emotions' list fix for classification purposes:
#     Class indices must start from 0 and be contiguous, and 'calm' ("02") is left out, so the codes are re-indexed ("01" -> 0, "03" -> 1, ... "07" -> 5); any other code maps to 6 (surprised).
def emotionfix(e_num):
    """Convert a two-digit emotion code into a zero-based class index.

    Parameters:
        e_num: emotion code as a string, e.g. "01" or "05".

    Returns:
        0 neutral, 1 happy, 2 sad, 3 angry, 4 fear, 5 disgust, and 6
        (surprised) for every other value. Note that "02" (calm) has no entry
        of its own and therefore also yields 6.
    """
    # Position in this tuple is the class index of the code.
    known_codes = ("01", "03", "04", "05", "06", "07")
    if e_num in known_codes:
        return known_codes.index(e_num)
    return 6  # surprised (also the fallback for unknown codes)

In [None]:

# Scan the dataset folder and record the silence-trimmed length (in samples)
# of every file; the maximum is printed for padding purposes.

# folder_path = '/content/drive/MyDrive/Colab_Notebooks/AudioFiles/TESS'
folder_path = '/content/drive/MyDrive/Colab_Notebooks/AudioFiles/Used Dataset'

sample_lengths = []

for subdir, dirs, files in os.walk(folder_path):
    for file in files:
        audio_file = os.path.join(subdir, file)
        # sr=None: load at the file's own sampling rate.
        x, sr = librosa.load(path=audio_file, sr=None)
        # Drop leading/trailing parts quieter than 30 dB below the peak.
        xt, index = librosa.effects.trim(x, top_db=30)
        sample_lengths.append(len(xt))

longest = np.max(sample_lengths)
print('Maximum sample length:', longest)



In [None]:
import time
tic = time.perf_counter()

# Per-file feature lists. Each entry is the 2-D array returned by the matching
# librosa feature function; the lists are stacked into arrays in the next cell.
rms = []
zcr = []
mfcc = []
chroma = []
emotions = []

# Framing parameters
total_length = 228864 #228864  #305152  #5005152    # desired length (in samples) for all of the audio samples.
frame_length = 2048
hop_length = 512

# folder_path = '/content/drive/MyDrive/Colab_Notebooks/AudioFiles/TESS'
folder_path = '/content/drive/MyDrive/Colab_Notebooks/AudioFiles/Used Dataset'

for subdir, dirs, files in os.walk(folder_path):
    for file in files:
        audio_path = os.path.join(subdir, file)

        # Load the audio file once; its native sample rate is needed by the
        # noise reduction, MFCC and chroma steps below.
        rawsound = AudioSegment.from_file(audio_path)
        sr = rawsound.frame_rate
        # Peak-normalize leaving 5 dB of headroom (peak at -5.0 dBFS).
        normalizedsound = effects.normalize(rawsound, headroom = 5.0)
        # Transform the normalized audio to np.array of samples.
        # NOTE(review): for multi-channel files the samples are presumably
        # interleaved per channel - confirm the dataset is mono.
        normal_x = np.array(normalizedsound.get_array_of_samples(), dtype = 'float32')
        # Trim silence from the beginning and the end.
        xt, index = librosa.effects.trim(normal_x, top_db=30)
        # Clip over-long recordings: np.pad rejects a negative pad width, so a
        # file longer than total_length would otherwise abort the whole run.
        if len(xt) > total_length:
            xt = xt[:total_length]
        # Zero-pad at the end so every sample has exactly total_length values.
        padded_x = np.pad(xt, (0, total_length - len(xt)), 'constant')
        # Noise reduction (the padded signal itself is used as the noise profile).
        final_x = nr.reduce_noise(y=padded_x, y_noise=padded_x, sr=sr)

        # Features extraction. The audio is passed as `y=` because recent
        # librosa releases accept it only as a keyword argument.
        f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length) # Energy - Root Mean Square
        f2 = librosa.feature.zero_crossing_rate(y=final_x, frame_length=frame_length, hop_length=hop_length, center=True) # ZCR
        f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=hop_length) # MFCC
        f4 = librosa.feature.chroma_stft(y=final_x, sr=sr) # chroma

        # Emotion extraction from the different databases: first try the
        # text markers in the file name, otherwise read the numeric code at
        # characters 6-7 of the file name (RAVDESS-style naming).
        name = find_emotion_T(file)
        if name == "-1":
            name = file[6:8]

        # Filling the data lists
        rms.append(f1)
        zcr.append(f2)
        mfcc.append(f3)
        chroma.append(f4)
        emotions.append(emotionfix(name))


toc = time.perf_counter()
print(f"Running time: {(toc - tic)/60:0.4f} minutes")

In [None]:
# Stack every per-file feature list into one float32 array and swap its last
# two axes to obtain the 3D format: (batch, timesteps, feature)

f_rms = np.swapaxes(np.asarray(rms).astype('float32'), 1, 2)
f_zcr = np.swapaxes(np.asarray(zcr).astype('float32'), 1, 2)
f_mfccs = np.swapaxes(np.asarray(mfcc).astype('float32'), 1, 2)
f_chroma = np.swapaxes(np.asarray(chroma).astype('float32'), 1, 2)

# Report the resulting shape of each feature block.
for caption, feature_block in (
    ('RMS shape:', f_rms),
    ('ZCR shape:', f_zcr),
    ('MFCCs shape:', f_mfccs),
    ('Chroma shape:', f_chroma),
):
    print(caption, feature_block.shape)

In [None]:
# Model input 'X': the four feature blocks joined along the last (feature) axis.
feature_blocks = (f_rms, f_zcr, f_mfccs, f_chroma)
X = np.concatenate(feature_blocks, axis=2)

# Target 'Y': the emotion indices as int8, with a second axis added so that
# the array is 2D (one row per sample).
Y = np.expand_dims(np.asarray(emotions).astype('int8'), axis=1)

In [None]:
# Persist X and Y as JSON files; the arrays are converted to nested lists first.

x_path = '/content/drive/My Drive/Colab_Notebooks/X_datanew.json' # FILE SAVE PATH
y_path = '/content/drive/My Drive/Colab_Notebooks/Y_datanew.json' # FILE SAVE PATH

# Features
x_data = X.tolist()
dump(obj = x_data, fp = x_path)

# Labels
y_data = Y.tolist()
dump(obj = y_data, fp = y_path)


In [None]:
# Read the saved X and Y JSON files and turn the loaded lists back into
# NumPy arrays (float32 features, int8 labels).

x_path = '/content/drive/My Drive/Colab_Notebooks/X_datanew.json' # FILE LOAD PATH
y_path = '/content/drive/My Drive/Colab_Notebooks/Y_datanew.json' # FILE LOAD PATH

X = np.asarray(load(x_path), dtype = 'float32')
Y = np.asarray(load(y_path), dtype = 'int8')

In [None]:
# Split to train, validation, and test sets.
from sklearn.model_selection import train_test_split

# Step 1: hold out 12.5 % of the samples; the rest is the training set.
x_train, x_tosplit, y_train, y_tosplit = train_test_split(
    X, Y,
    test_size = 0.125,
    random_state = 1,
)
# Step 2: divide the held-out part into validation and test (30.4 % test).
x_val, x_test, y_val, y_test = train_test_split(
    x_tosplit, y_tosplit,
    test_size = 0.304,
    random_state = 1,
)

# 'One-hot' vectors (7 classes) for the training and validation labels.
y_train_class, y_val_class = (
    tf.keras.utils.to_categorical(labels, 7, dtype = 'int8')
    for labels in (y_train, y_val)
)

In [None]:
# Print the shape of each feature split: train, validation, test (in that order).
for split_features in (x_train, x_val, x_test):
    print(np.shape(split_features))

In [None]:
# Write the test split to JSON files in the current working directory so it
# can be reloaded later for evaluation.

for file_path, split_data in (
    ('x_test_data.json', x_test),
    ('y_test_data.json', y_test),
):
    dump(obj = split_data, fp = file_path)

In [None]:
from keras.models import Sequential
from keras import layers
from keras import optimizers
from keras import callbacks 

In [None]:
# Build, train and checkpoint the stacked-LSTM emotion classifier.
import time

# Model: two stacked LSTM layers (64 units each) followed by a 7-way classifier.
model = Sequential()
model.add(layers.LSTM(64, return_sequences = True, input_shape=(X.shape[1:3])))
model.add(layers.LSTM(64))
# The output layer uses softmax: categorical cross-entropy expects each output
# row to be a probability distribution over the 7 classes, which relu does
# not provide (its outputs are unbounded and can all be zero).
model.add(layers.Dense(7, activation = 'softmax'))

# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()

batch_size = 23

# Callbacks functions
checkpoint_path = '/content/drive/My Drive/Colab_Notebooks/best_weights.hdf5'
# Resume from an earlier checkpoint when one exists; on a first run there is
# no such file yet and training starts from freshly initialized weights.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

#-> Save the best weights
mcp_save = callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True,
                           monitor='val_categorical_accuracy',
                           mode='max')
#-> Reduce learning rate after 100 epochs without improvement.
rlrop = callbacks.ReduceLROnPlateau(monitor='val_categorical_accuracy',
                                    factor=0.1, patience=100,
                                    mode='max')  # accuracy: higher is better

# Compile & train
model.compile(loss='categorical_crossentropy',
                optimizer='RMSProp',
                metrics=['categorical_accuracy'])

# Time the training itself rather than re-printing the timer values left
# behind by the feature-extraction cell.
tic = time.perf_counter()
history = model.fit(x_train, y_train_class,
                      epochs=340, batch_size = batch_size,   #340
                      validation_data = (x_val, y_val_class),
                      callbacks = [mcp_save, rlrop])
toc = time.perf_counter()

# Define the best weights to the model.
model.load_weights(checkpoint_path)

print(f"Running time: {(toc - tic)/60:0.4f} minutes")

In [None]:
# checkpoint_path = '/content/drive/My Drive/Colab_Notebooks/best_weights.hdf5'

# model = Sequential()
# model.add(layers.LSTM(64, return_sequences = True, input_shape=(X.shape[1:3])))
# model.add(layers.LSTM(64))
# model.add(layers.Dense(7, activation = 'softmax'))

# model.compile(loss='categorical_crossentropy', 
#                 optimizer='RMSProp', 
#                 metrics=['categorical_accuracy'])


# model.load_weights(checkpoint_path)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Training curves: one figure for the loss and one for the accuracy, each
# showing the training and the validation series per epoch.

curve_specs = (
    # (train key, validation key, train label, validation label, title, y-axis label)
    ('loss', 'val_loss',
     'Loss (training data)', 'Loss (validation data)',
     'Loss for train and validation', 'Loss value'),
    ('categorical_accuracy', 'val_categorical_accuracy',
     'Acc (training data)', 'Acc (validation data)',
     'Model accuracy', 'Acc %'),
)

for train_key, val_key, train_label, val_label, title, y_label in curve_specs:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('No. epoch')
    plt.legend(loc="upper left")
    plt.show()

In [None]:
# Score the model on the validation split; evaluate() returns the loss
# followed by the compiled metric (categorical accuracy).
loss, acc = model.evaluate(
    x_val,
    y_val_class,
    verbose=2,
)

In [None]:
# Show both shapes at once: a notebook cell only displays the value of its
# last expression, so two separate expression lines would show just the second.
y_val_class.shape, x_val.shape

In [None]:
# Validation Confusion matrix

# index = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
# columns = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

index = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
columns = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# True class indices. They get their own name so that the one-hot
# y_val_class stays intact and this cell (and the evaluation cell) can be
# re-run without errors.
y_val_true = np.argmax(y_val_class, axis=1)
predictions = model.predict(x_val)
y_pred_class = np.argmax(predictions, axis=1)

# labels=... forces one row/column per class even when a class is missing
# from both the true and the predicted labels; otherwise the matrix shrinks
# and no longer matches the 7 names used for the DataFrame below.
cm = confusion_matrix(y_val_true, y_pred_class, labels=list(range(len(index))))

cm_df = pd.DataFrame(cm,index,columns)
plt.figure(figsize=(12,7))
ax = plt.axes()

sns.heatmap(cm_df, ax = ax, cmap = 'PuBu', fmt="d", annot=True)
ax.set_ylabel('True emotion')
ax.set_xlabel('Predicted emotion')

In [None]:
# Per-class accuracy on the validation set: the diagonal of the confusion
# matrix (correct predictions) divided by the row sums (true samples per class).

values = cm.diagonal()
print(values)
row_sum = np.sum(cm,axis=1)
print(row_sum)
acc = values / row_sum

print('Validation set predicted emotions accuracy:')
for e, class_rate in enumerate(acc):
    print(index[e],':', f"{class_rate:0.4f}")

In [None]:
# Persist the trained network: architecture as a JSON file, weights as an .h5 file.

from keras.models import model_from_json
from keras.models import load_model

saved_model_path = '/content/drive/My Drive/Colab_Notebooks/model8723.json'
saved_weights_path = '/content/drive/My Drive/Colab_Notebooks/model8723_weights.h5'

# Architecture
model_json = model.to_json()
with open(saved_model_path, "w") as json_file:
    json_file.write(model_json)

# Weights
model.save_weights(saved_weights_path)

print("Saved model to disk")


In [None]:
# Restore the saved network: architecture from the JSON file, then its weights.

saved_model_path = '/content/drive/MyDrive/Colab_Notebooks/model8723.json'
saved_weights_path = '/content/drive/MyDrive/Colab_Notebooks/model8723_weights.h5'

with open(saved_model_path , 'r') as json_file:
    json_savedModel = json_file.read()

# Rebuild the architecture and attach the stored weights.
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compile with the same loss / optimizer / metric used for training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSProp',
    metrics=['categorical_accuracy'],
)

# Model's structure visualization (also written to model.png)
tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
# Reload the saved test split from its JSON files and convert it back to
# NumPy arrays (float32 features, int8 labels).

x_test = np.asarray(load( 'x_test_data.json')).astype('float32')
y_test = np.asarray(load('y_test_data.json')).astype('int8')

# 'One-hot' vectors (7 classes) for the test labels.
y_test_class = tf.keras.utils.to_categorical(y_test, 7, dtype = 'int8')

In [None]:
# Score the model on the test split; evaluate() returns the loss followed by
# the compiled metric (categorical accuracy).
loss, acc = model.evaluate(
    x_test,
    y_test_class,
    verbose=2,
)

In [None]:
# Test set Confusion matrix
print(x_test.shape)

# index = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
# columns = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

index = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
columns = ['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# True and predicted class indices (argmax over the class axis).
y = np.argmax(y_test_class, axis=1)
predictions = model.predict(x_test)
y_pred_class = np.argmax(predictions, axis=1)

# labels=... forces one row/column per class even when a class is missing
# from both the true and the predicted labels; otherwise the matrix shrinks
# and no longer matches the 7 names used for the DataFrame below.
cm = confusion_matrix(y, y_pred_class, labels=list(range(len(index))))

cm_df = pd.DataFrame(cm,index,columns)
plt.figure(figsize=(12,7))
ax = plt.axes()

sns.heatmap(cm_df, ax = ax, cmap = 'BuGn', fmt="d", annot=True)
ax.set_ylabel('True emotion')
ax.set_xlabel('Predicted emotion')

In [None]:
  # Test set prediction accuracy rates

# Per-class accuracy on the test set: the diagonal of the confusion matrix
# (correct predictions) divided by the row sums (true samples per class).
values = cm.diagonal()
row_sum = np.sum(cm,axis=1)
acc = values / row_sum

print('Test set predicted emotions accuracy:')
for e, class_rate in enumerate(acc):
    print(index[e],':', f"{class_rate:0.4f}")

In [None]:
# Classify a few individual audio files with the trained model.

subdir = '/content/drive/MyDrive/Colab_Notebooks/AudioFiles/Used Dataset'
traning = ["03-01-07-02-01-01-23.wav", "1013_TSI_SAD_XX.wav", "YAF_young_happy.wav"]  # files to classify

# subdir = "/content/drive/MyDrive/Colab_Notebooks/AudioFiles"
# traning = ["ode.wav"]

# Framing parameters (same values as in the feature-extraction cell).
total_length = 228864 #228864  #305152  #5005152    # desired length (in samples) for all of the audio samples.
frame_length = 2048
hop_length = 512

# Class index -> emotion name. Kept under its own name so that the
# 'emotions' label list built during feature extraction is not overwritten.
emotion_labels = {
    0 : 'neutral',
    1 : 'happy',
    2 : 'sad',
    3 : 'angry',
    4 : 'fearful',
    5 : 'disgust',
    6 : 'suprised'
}


def _extract_features(audio_path):
    """Turn one audio file into a model input of shape (1, timesteps, features).

    The steps are: load, peak-normalize, trim silence, clip/zero-pad to
    `total_length` samples, noise-reduce, then compute RMS, zero-crossing
    rate, 13 MFCCs and chroma. The features are joined in that order along
    the feature axis.

    Parameters:
        audio_path: path of the audio file to read.

    Returns:
        float32 array with a leading batch axis of size 1.
    """
    rawsound = AudioSegment.from_file(audio_path)
    # Native sample rate of the file; needed by noise reduction, MFCC and chroma.
    sample_rate = rawsound.frame_rate
    # Peak-normalize leaving 5 dB of headroom (peak at -5.0 dBFS).
    normalizedsound = effects.normalize(rawsound, headroom = 5.0)
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype = 'float32')
    # Trim silence from the beginning and the end. The returned interval is
    # discarded on purpose so the notebook-level 'index' list is left alone.
    trimmed, _ = librosa.effects.trim(normal_x, top_db=30)
    # Clip recordings longer than total_length: the model needs a fixed input
    # length and np.pad rejects a negative pad width.
    trimmed = trimmed[:total_length]
    padded_x = np.pad(trimmed, (0, total_length - len(trimmed)), 'constant')
    # Noise reduction (the padded signal itself is used as the noise profile).
    final_x = nr.reduce_noise(y=padded_x, y_noise=padded_x, sr=sample_rate)

    # The audio is passed as `y=` because recent librosa releases accept it
    # only as a keyword argument.
    per_feature = [
        librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length),  # Energy - Root Mean Square
        librosa.feature.zero_crossing_rate(y=final_x, frame_length=frame_length, hop_length=hop_length, center=True),  # ZCR
        librosa.feature.mfcc(y=final_x, sr=sample_rate, n_mfcc=13, hop_length=hop_length),  # MFCC
        librosa.feature.chroma_stft(y=final_x, sr=sample_rate),  # chroma
    ]
    # Each array is (features, timesteps): join along the feature axis,
    # transpose to (timesteps, features) and add the batch axis in front.
    stacked = np.concatenate(per_feature, axis=0).astype('float32')
    return stacked.T[np.newaxis, ...]


for file in traning:
    sample_x = _extract_features(os.path.join(subdir, file))
    predictions = model.predict(sample_x)

    # Index of the highest score = predicted class.
    max_emo = np.argmax(predictions)
    print('max emotion:', emotion_labels.get(max_emo,-1))
