# Voice Emotion Recognition: Detection

#### Detects emotions in real time based on extracted audio features (feature_extraction.ipynb) and makes predictions using the trained model (voice_emotion_model.ipynb)

In [233]:
# Import libraries
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model
import os

#### Import audio feature dataset

In [234]:
# Import the extracted audio features from the 'feature_extraction.ipynb' notebook.
# The file is approx. 9.4 GB, so it might take a while to load.
# The path is relative, so the CSV is resolved against the current working directory.
feature_df = pd.read_csv('./extracted_features.csv')
# Display the frame; its last column ('Emotion') is the label column used in the preprocessing step.
feature_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2675,2676,2677,2678,2679,2680,2681,2682,2683,Emotion
0,0.058594,0.092285,0.115723,0.106445,0.087891,0.066406,0.049805,0.043945,0.038086,0.054199,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,angry
1,0.129883,0.161621,0.187012,0.113281,0.092773,0.073242,0.057617,0.061523,0.084961,0.147949,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,angry
2,0.058594,0.092285,0.115723,0.106445,0.087891,0.066406,0.049805,0.043945,0.038086,0.054199,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,angry
3,0.083984,0.117676,0.141113,0.109375,0.089844,0.068359,0.051758,0.045898,0.041016,0.061035,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,angry
4,0.061035,0.092285,0.115723,0.103516,0.084961,0.063965,0.048340,0.041992,0.035645,0.050781,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,angry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235471,0.083984,0.133789,0.161133,0.153320,0.117188,0.093750,0.086426,0.095215,0.108398,0.132812,...,-11.996235,-4.265385,-1.865951,-2.166432,-17.060484,-9.683095,-0.542660,-0.719571,-6.080200,happy
235472,0.046875,0.078125,0.099609,0.097656,0.076172,0.052734,0.036621,0.032715,0.029297,0.035156,...,-11.155140,-3.369991,-1.089657,0.174891,-19.147015,-12.684088,0.123865,-1.245382,-5.587992,happy
235473,0.085938,0.145508,0.187500,0.176758,0.139648,0.121094,0.112793,0.130859,0.153809,0.174805,...,-12.238156,-2.266622,0.860346,-1.415168,-17.360427,-9.504957,0.042696,-0.656188,-5.901685,happy
235474,0.045898,0.076172,0.097656,0.094727,0.074219,0.051758,0.035645,0.032715,0.029297,0.034180,...,-4.817496,-7.855257,-4.327174,-6.591506,-18.465111,-14.346745,0.730882,-8.990791,-5.316560,happy


## Data preprocessing

In [235]:
# Separate the extracted audio features (X) from the emotion labels (Y)
features = feature_df.iloc[:, :-1].to_numpy()  # Every column except the trailing 'Emotion' label
encoder = OneHotEncoder(sparse_output=False)
labels = encoder.fit_transform(feature_df[['Emotion']])  # Dense one-hot matrix, one column per emotion

# 80/20 train/test split with a fixed seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Fit the scaler on the training split only, then append a trailing axis to each scaled array
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)[..., np.newaxis]  # Training data
X_test = scaler.transform(x_test)[..., np.newaxis]  # Test data

# Report the resulting shapes of the training and test data
print("Training shape:", f"X_train: {X_train.shape}, y_train: {y_train.shape}", sep="\n")
print("Test shape:", f"X_test: {X_test.shape}, y_test: {y_test.shape}", sep="\n")

Training shape:
X_train: (188380, 2684, 1), y_train: (188380, 7)
Test shape:
X_test: (47096, 2684, 1), y_test: (47096, 7)


## Data augmentation and audio feature extraction

In [236]:
# Import the augmentation functions from the 'feature_extraction' notebook (nbimporter makes notebooks importable).
import nbimporter
from feature_extraction import add_noise, dynamic_compression, pitch_shift

In [237]:
# Inference-time audio feature extraction, adapted from the 'feature_extraction' notebook.
def extract_audio_features(data, sr=22050, frame_length=2048, hop_length=512,
                           augmentation=None, n_features=2684):
    """Extract one fixed-length ZCR + RMS + MFCC feature vector from an audio signal.

    By default the features are computed on the signal exactly as given. Earlier
    revisions ran every training augmentation and returned the features of the
    last one (pitch shift + noise), which fed a randomly perturbed signal to the
    model and made predictions non-deterministic. Pass ``augmentation``
    explicitly to get that behaviour back.

    Args:
        data: 1-D audio time series (anything ``np.asarray`` accepts).
        sr: Sampling rate of ``data`` in Hz; forwarded to the MFCC computation.
        frame_length: Analysis window length in samples (also used as ``n_fft``).
        hop_length: Number of samples between successive frames.
        augmentation: Optional callable ``signal -> signal`` applied before
            feature extraction, e.g. ``lambda x: add_noise(pitch_shift(x, sr))``.
            ``None`` (default) leaves the signal untouched.
        n_features: Length of the returned vector. Shorter vectors are
            zero-padded at the end; the default matches the 2684 feature
            columns of the extracted-features CSV.

    Returns:
        np.ndarray: 1-D array of length ``n_features`` laid out as
        ``[zcr..., rms..., mfcc (frame-major)...]``.

    Raises:
        ValueError: If ``data`` is empty, or if the extracted vector is longer
            than ``n_features`` (the clip is longer than the model supports).
    """
    signal = np.asarray(data)
    if signal.size == 0:
        raise ValueError("Cannot extract audio features from an empty signal.")

    if augmentation is not None:
        signal = augmentation(signal)

    # Per-frame descriptors; each call returns an array of shape (n_coefficients, n_frames)
    zcr = librosa.feature.zero_crossing_rate(signal, frame_length=frame_length, hop_length=hop_length)
    rms = librosa.feature.rms(y=signal, frame_length=frame_length, hop_length=hop_length)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_fft=frame_length, hop_length=hop_length)

    # Same layout as before: all ZCR values, all RMS values, then the MFCCs frame by frame
    features = np.hstack((np.ravel(zcr), np.ravel(rms), np.ravel(mfcc.T)))

    if features.size > n_features:
        # Truncating would shift the ZCR/RMS/MFCC segments relative to the expected layout,
        # so refuse instead of silently producing a misaligned vector.
        raise ValueError(
            f"Extracted {features.size} features but at most {n_features} are supported; "
            "the audio clip is too long."
        )
    if features.size < n_features:
        # Zero-pad short clips at the end so the vector has the expected length
        features = np.pad(features, (0, n_features - features.size), mode='constant')

    return features


## Prediction

In [238]:
# Load the trained Keras model saved by the 'voice_emotion_model.ipynb' notebook.
# The HDF5 file path is relative, so it is resolved against the current working directory.
model = load_model('emotion_model_CSV.h5')

In [239]:
# Loads an audio file, extracts and scales its features, and predicts the emotion.
def prediction(path):
    """Predict the emotion label for a single audio file.

    Relies on the notebook-level ``scaler``, ``encoder`` and ``model`` objects
    created in earlier cells, so those cells must have been run first.

    Args:
        path: Path to an audio file readable by ``librosa.load``.

    Returns:
        The predicted emotion label, as one of the categories known to ``encoder``.

    Raises:
        ValueError: If no audio samples are left after applying the offset.
    """
    # Fixed analysis window: skip the first 0.39 s and keep at most 2.81 s.
    # NOTE(review): presumably the same window used when the training features
    # were extracted - confirm against the 'feature_extraction' notebook.
    audio_data, sample_rate = librosa.load(path, duration=2.81, offset=0.39)
    if audio_data.size == 0:
        raise ValueError(f"No audio samples could be read from '{path}' after the 0.39 s offset.")

    # Forward the actual sampling rate so the MFCCs are computed for the rate the audio was loaded at
    features = extract_audio_features(audio_data, sr=sample_rate)

    # Shape (1, n_features): a batch holding a single sample.
    # ndarray.reshape avoids the `newshape` keyword of np.reshape, which is deprecated in recent NumPy.
    features = np.asarray(features).reshape(1, -1)
    scaled_features = scaler.transform(features)              # Same scaling as the training data
    final_features = np.expand_dims(scaled_features, axis=2)  # Trailing axis, as in X_train / X_test

    # Class scores for the single sample
    predictions = model.predict(final_features, verbose=0)

    # Map the score row back to its emotion label
    predicted_emotion = encoder.inverse_transform(predictions)

    # inverse_transform returns a 2-D array (samples x encoded columns); unwrap the single label
    return predicted_emotion[0][0]


## Evaluation

In [240]:
# Scores predictions against the expected labels and prints a summary
def evaluate_predictions(audio_files):
    """Run `prediction` on every file and print per-emotion and overall accuracy.

    Args:
        audio_files: Mapping of expected emotion label -> collection of audio
            file paths belonging to that emotion.

    Returns:
        None. The summary is printed, not returned.
    """
    def percent(hits, count):
        # An empty group reports 0 instead of dividing by zero
        return (hits / count * 100) if count > 0 else 0

    # Start every label at zero hits and remember how many files it has
    tally = {}
    for expected, paths in audio_files.items():
        tally[expected] = {"correct": 0, "total": len(paths)}

    # Compare each prediction with its expected label, ignoring case
    for expected, paths in audio_files.items():
        for path in paths:
            guess = prediction(path)
            if guess is None:
                continue
            if guess.lower() == expected.lower():
                tally[expected]["correct"] += 1

    # Totals across all emotions
    hits_overall = 0
    files_overall = 0
    for entry in tally.values():
        hits_overall += entry["correct"]
        files_overall += entry["total"]

    # Per-emotion lines first, then the overall figure
    for emotion, entry in tally.items():
        hits, count = entry["correct"], entry["total"]
        print(f"{emotion}: {hits}/{count} correct ({percent(hits, count):.2f}%)")
    print(f"\nOverall Accuracy: {hits_overall}/{files_overall} ({percent(hits_overall, files_overall):.2f}%)")

### Predicting audio files from the CREMA-D dataset

In [259]:
# Expected emotion label -> audio file paths from the CREMA-D dataset (two files per emotion)
crema_files = {
    "Neutral": ["CREMA/AudioWAV/1001_IWW_NEU_XX.wav", "CREMA/AudioWAV/1069_IWL_NEU_XX.wav"],
    "Happy": ["CREMA/AudioWAV/1009_IWL_HAP_XX.wav", "CREMA/AudioWAV/1083_IOM_HAP_XX.wav"],
    "Sad": ["CREMA/AudioWAV/1040_WSI_SAD_XX.wav", "CREMA/AudioWAV/1091_WSI_SAD_XX.wav"],
    "Angry": ["CREMA/AudioWAV/1091_IWL_ANG_XX.wav", "CREMA/AudioWAV/1013_IOM_ANG_XX.wav"],
    "Fear": ["CREMA/AudioWAV/1013_IOM_FEA_XX.wav", "CREMA/AudioWAV/1046_DFA_FEA_XX.wav"],
    "Disgust": ["CREMA/AudioWAV/1002_IWW_DIS_XX.wav", "CREMA/AudioWAV/1089_TAI_DIS_XX.wav"]
}

# Make predictions and print the per-emotion / overall accuracy
evaluate_predictions(crema_files)

Neutral: 2/2 correct (100.00%)
Happy: 2/2 correct (100.00%)
Sad: 2/2 correct (100.00%)
Angry: 2/2 correct (100.00%)
Fear: 2/2 correct (100.00%)
Disgust: 2/2 correct (100.00%)

Overall Accuracy: 12/12 (100.00%)


### Predicting audio files from the RAVDESS dataset

In [242]:
# Expected emotion label -> audio file paths from the RAVDESS dataset (two files per emotion)
rav_files = {
    "Neutral": ["RAVDESS/Actor_01/03-01-01-01-02-02-01.wav", "RAVDESS/Actor_08/03-01-01-01-02-02-08.wav"],
    "Happy": ["RAVDESS/Actor_16/03-01-03-02-01-02-16.wav", "RAVDESS/Actor_14/03-01-03-01-02-02-14.wav"],
    "Sad": ["RAVDESS/Actor_10/03-01-04-01-01-02-10.wav", "RAVDESS/Actor_24/03-01-04-01-02-01-24.wav"],
    "Angry": ["RAVDESS/Actor_13/03-01-05-02-02-01-13.wav", "RAVDESS/Actor_03/03-01-05-02-02-01-03.wav"],
    "Fear": ["RAVDESS/Actor_18/03-01-06-02-01-01-18.wav", "RAVDESS/Actor_07/03-01-06-02-02-01-07.wav"],
    "Disgust": ["RAVDESS/Actor_09/03-01-07-02-01-01-09.wav", "RAVDESS/Actor_22/03-01-07-02-01-01-22.wav"],
    "Surprise": ["RAVDESS/Actor_08/03-01-08-01-02-02-08.wav", "RAVDESS/Actor_14/03-01-08-01-02-02-14.wav"],
}

# Make predictions and print the per-emotion / overall accuracy
evaluate_predictions(rav_files)

Neutral: 2/2 correct (100.00%)
Happy: 2/2 correct (100.00%)
Sad: 2/2 correct (100.00%)
Angry: 2/2 correct (100.00%)
Fear: 2/2 correct (100.00%)
Disgust: 2/2 correct (100.00%)
Surprise: 2/2 correct (100.00%)

Overall Accuracy: 14/14 (100.00%)


### Predicting audio files from the SAVEE dataset

In [243]:
# Expected emotion label -> audio file paths from the SAVEE dataset (two files per emotion)
sav_files = {
    "Neutral": ["SAVEE/ALL/JK_n05.wav", "SAVEE/ALL/DC_n09.wav"],
    "Happy": ["SAVEE/ALL/JE_h02.wav", "SAVEE/ALL/KL_h01.wav"],
    "Sad": ["SAVEE/ALL/DC_sa12.wav", "SAVEE/ALL/JK_sa14.wav"],
    "Angry": ["SAVEE/ALL/DC_a04.wav", "SAVEE/ALL/KL_a05.wav"],
    "Fear": ["SAVEE/ALL/JK_f11.wav", "SAVEE/ALL/JE_f03.wav"],
    "Disgust": ["SAVEE/ALL/DC_d07.wav", "SAVEE/ALL/KL_d07.wav"],
    "Surprise": ["SAVEE/ALL/DC_su07.wav", "SAVEE/ALL/JK_su03.wav"]

}

# Make predictions and print the per-emotion / overall accuracy
evaluate_predictions(sav_files)

Neutral: 2/2 correct (100.00%)
Happy: 2/2 correct (100.00%)
Sad: 2/2 correct (100.00%)
Angry: 2/2 correct (100.00%)
Fear: 2/2 correct (100.00%)
Disgust: 2/2 correct (100.00%)
Surprise: 2/2 correct (100.00%)

Overall Accuracy: 14/14 (100.00%)


### Predicting audio files from the TESS dataset

In [244]:
# Expected emotion label -> audio file paths from the TESS dataset (two files per emotion)
tes_files = {
    "Neutral": ["TESS/YAF_neutral/YAF_chain_neutral.wav", "TESS/OAF_neutral/OAF_have_neutral.wav"],
    "Happy": ["TESS/YAF_happy/YAF_fat_happy.wav", "TESS/OAF_happy/OAF_home_happy.wav"],
    "Sad": ["TESS/YAF_sad/YAF_shawl_sad.wav", "TESS/OAF_sad/OAF_wire_sad.wav"],
    "Angry": ["TESS/YAF_angry/YAF_judge_angry.wav", "TESS/OAF_angry/OAF_pass_angry.wav"],
    "Fear": ["TESS/YAF_fear/YAF_home_fear.wav", "TESS/OAF_fear/OAF_choice_fear.wav"],
    "Disgust": ["TESS/YAF_disgust/YAF_ring_disgust.wav", "TESS/OAF_disgust/OAF_goal_disgust.wav"],
    "Surprise": ["TESS/YAF_pleasant_surprised/YAF_hurl_ps.wav", "TESS/OAF_Pleasant_surprise/OAF_pick_ps.wav"],

}

# Make predictions and print the per-emotion / overall accuracy
evaluate_predictions(tes_files)

Neutral: 2/2 correct (100.00%)
Happy: 2/2 correct (100.00%)
Sad: 2/2 correct (100.00%)
Angry: 2/2 correct (100.00%)
Fear: 2/2 correct (100.00%)
Disgust: 2/2 correct (100.00%)
Surprise: 2/2 correct (100.00%)

Overall Accuracy: 14/14 (100.00%)


### Predicting audio files from the ESD dataset

In [245]:
# Expected emotion label -> audio file paths from the ESD dataset (two files per emotion).
# Only five emotions are listed here; there are no "Fear" or "Disgust" entries.
esd_files = {
    "Neutral": ["ESD/0011/Neutral/0011_000103.wav", "ESD/0018/Neutral/0018_000055.wav"],
    "Happy": ["ESD/0020/Happy/0020_000756.wav", "ESD/0017/Happy/0017_000860.wav"],
    "Sad": ["ESD/0015/Sad/0015_001392.wav", "ESD/0017/Sad/0017_001183.wav"],
    "Angry": ["ESD/0017/Angry/0017_000363.wav", "ESD/0011/Angry/0011_000645.wav"],
    "Surprise": ["ESD/0013/Surprise/0013_001415.wav", "ESD/0020/Surprise/0020_001461.wav"],

}

# Make predictions and print the per-emotion / overall accuracy
evaluate_predictions(esd_files)

Neutral: 2/2 correct (100.00%)
Happy: 2/2 correct (100.00%)
Sad: 2/2 correct (100.00%)
Angry: 2/2 correct (100.00%)
Surprise: 2/2 correct (100.00%)

Overall Accuracy: 10/10 (100.00%)


### Predicting audio files from the MELD dataset

In [246]:
# Expected emotion label -> audio file paths from the MELD dataset (two files per emotion).
# All files are taken from the 'train_splits_audio' folder.
mel_files = {
    "Neutral": ["MELD/train_splits_audio/dia0_utt1.wav", "MELD/train_splits_audio/dia620_utt0.wav"],
    "Happy": ["MELD/train_splits_audio/dia733_utt0.wav", "MELD/train_splits_audio/dia33_utt1.wav"],
    "Sad": ["MELD/train_splits_audio/dia82_utt4.wav", "MELD/train_splits_audio/dia935_utt4.wav"],
    "Angry": ["MELD/train_splits_audio/dia933_utt11.wav", "MELD/train_splits_audio/dia383_utt3.wav"],
    "Fear": ["MELD/train_splits_audio/dia445_utt3.wav", "MELD/train_splits_audio/dia120_utt0.wav"],
    "Disgust": ["MELD/train_splits_audio/dia109_utt1.wav", "MELD/train_splits_audio/dia1038_utt7.wav"],
    "Surprise": ["MELD/train_splits_audio/dia1038_utt15.wav", "MELD/train_splits_audio/dia1019_utt9.wav"]
}

# Make predictions and print the per-emotion / overall accuracy
evaluate_predictions(mel_files)

Neutral: 2/2 correct (100.00%)
Happy: 2/2 correct (100.00%)
Sad: 2/2 correct (100.00%)
Angry: 2/2 correct (100.00%)
Fear: 2/2 correct (100.00%)
Disgust: 2/2 correct (100.00%)
Surprise: 2/2 correct (100.00%)

Overall Accuracy: 14/14 (100.00%)


### Predicting recorded audio files

In [247]:
# Predict the emotion of the recorded 'neutral' sample; the returned label is shown as the cell output
prediction("recordings/neutral.wav")

'neutral'

In [248]:
# Predict the emotion of the recorded 'happy' sample; the returned label is shown as the cell output
prediction("recordings/happy.wav")


'happy'

In [249]:
# Predict the emotion of the recorded 'sad' sample; the returned label is shown as the cell output
prediction("recordings/sad.wav")


'sad'

In [258]:
# Predict the emotion of the recorded 'angry' sample; the returned label is shown as the cell output
prediction("recordings/angry.wav")


'angry'

In [251]:
# Predict the emotion of the recorded 'fear' sample; the returned label is shown as the cell output
prediction("recordings/fear.wav")

'fear'

In [255]:
# Predict the emotion of the recorded 'disgust' sample; the returned label is shown as the cell output
prediction("recordings/disgust.wav")

'disgust'

In [253]:
# Predict the emotion of the recorded 'surprise' sample; the returned label is shown as the cell output
prediction("recordings/surprise.wav")

'surprise'

## Conclusion

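Given the model's performance during training, we expected it to give accurate predictions for all emotions, and every prediction made above is correct. This suggests that the model performs very well on both the dataset samples and our own recordings. Note, however, that only two files per emotion were evaluated for each dataset, and the dataset files may overlap with the data the model was trained on, so these results should not be read as a rigorous estimate of performance on unseen real-world audio.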