In [74]:
directory = './EmoDB_dataset/wav'

# MFCC features

## Feature extraction

In [75]:
import os
import librosa
import numpy as np
from typing import Dict

def extract_mfcc_features(file_path: str, n_mfcc: int = 39, 
                          frame_size: float = 0.025, frame_stride: float = 0.01, 
                          n_segments: int = 10) -> np.ndarray:
    """
    Extracts 39 MFCC features framewise from an audio file and then applies
    average pooling to condense the features over time into an n x 39 feature matrix.
    
    Parameters:
      file_path (str): Path to the audio file.
      n_mfcc (int): Number of MFCC features to extract. Default is 39.
      frame_size (float): Length of each frame in seconds. Default is 0.025.
      frame_stride (float): Step between successive frames in seconds. Default is 0.01.
      n_segments (int): Number of segments (n) to pool the frames into.
    
    Returns:
      np.ndarray: A n x 39 array where each row is the average MFCC vector for that segment.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sample_rate)
        hop_length = int(frame_stride * sample_rate)
        
        # Extract MFCC features; result shape is (n_mfcc, T) where T is number of frames.
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc,
                                    n_fft=frame_length, hop_length=hop_length)
        
        # Normalize the MFCC features along each coefficient dimension.
        mfcc_normalized = mfcc - np.mean(mfcc, axis=1, keepdims=True)
        
        # Transpose to shape (T, n_mfcc) for pooling along the time axis.
        mfcc_normalized = mfcc_normalized.T
        
        # Divide the frames into n_segments segments and compute the average for each segment.
        segments = np.array_split(mfcc_normalized, n_segments, axis=0)
        pooled_features = np.array([np.mean(seg, axis=0) for seg in segments])
        
        return pooled_features  # Shape: (n_segments, 39)
    
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return np.array([])

def process_directory_mfcc(directory: str, n_segments: int = 10) -> Dict[str, np.ndarray]:
    """
    Processes all .wav files in the given directory, extracting their MFCC features
    using average pooling to produce an n x 39 feature matrix for each file.
    
    Parameters:
      directory (str): Path to the directory containing .wav files.
      n_segments (int): Number of segments to pool the frames into for each file.
    
    Returns:
      Dict[str, np.ndarray]: A dictionary mapping filenames to their corresponding feature matrices.
    """
    feature_vectors = {}
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            features = extract_mfcc_features(file_path, n_segments=n_segments)
            if features.size > 0:
                feature_vectors[filename] = features
    return feature_vectors


In [76]:
n = 85
mfccFeatures = process_directory_mfcc(directory, n)

len(mfccFeatures)

535

In [77]:
print(mfccFeatures["03a01Fa.wav"].shape)
print(mfccFeatures["03a02Fc.wav"].shape)

(85, 39)
(85, 39)


## Classifier on MFCC

In [78]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


def load_labels(csv_file: str) -> pd.DataFrame:
    return pd.read_csv(csv_file)

def prepare_dataset(features: dict, labels: pd.DataFrame):
    """
    Constructs the dataset by matching each audio file's feature matrix with its label.
    Since each file is represented as an n x 39 matrix (n segments by 39 features),
    we flatten it into a 1D feature vector of length n*39.
    """
    X = []
    y = []
    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            # Flatten the (n, 39) matrix to a 1D vector (n*39,)
            feature_matrix = features[file_id]
            feature_vector = feature_matrix.flatten()
            X.append(feature_vector)
            y.append(int(row['EmotionNumeric']))
    return np.array(X), np.array(y)

def train_and_evaluate(X, y):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train GMM Classifier
    gmm_model = GaussianMixture(n_components=len(np.unique(y)), random_state=42)
    gmm_model.fit(X_train)
    gmm_predictions = gmm_model.predict(X_test)

    # Train SVM Classifier
    svm_model = SVC(kernel='linear', random_state=42)
    svm_model.fit(X_train, y_train)
    svm_predictions = svm_model.predict(X_test)

    # Evaluate classifiers
    print("GMM Classifier Report:")
    print(classification_report(y_test, gmm_predictions))

    print("SVM Classifier Report:")
    print(classification_report(y_test, svm_predictions))

    # Optionally save models
    # joblib.dump(gmm_model, 'gmm_model.pkl')
    # joblib.dump(svm_model, 'svm_model.pkl')

In [79]:
labels_csv_path = "EmoDB_dataset/emotion_mapping_detailed.csv"
labels = load_labels(labels_csv_path)

# Prepare the dataset: each feature matrix is flattened to become a vector
X, y = prepare_dataset(mfccFeatures, labels)
print("Dataset shape:", X.shape)

train_and_evaluate(X, y)


Dataset shape: (535, 3315)




GMM Classifier Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.00      0.00      0.00        20
           2       0.11      0.25      0.15        12
           3       0.15      0.21      0.18        14
           4       0.06      0.06      0.06        18
           5       0.09      0.22      0.12         9
           6       0.15      0.12      0.14        16

    accuracy                           0.10       107
   macro avg       0.08      0.12      0.09       107
weighted avg       0.07      0.10      0.08       107

SVM Classifier Report:
              precision    recall  f1-score   support

           0       0.48      0.78      0.60        18
           1       0.75      0.60      0.67        20
           2       0.50      0.08      0.14        12
           3       0.33      0.29      0.31        14
           4       0.67      0.56      0.61        18
           5       0.50      1.0