In [3]:
directory = './EmoDB_dataset/wav'

# MFCC features

## Feature extraction

In [4]:
import os
import librosa
import numpy as np
from typing import Dict

def extract_mfcc_features(file_path: str, n_mfcc: int = 39, 
                          frame_size: float = 0.025, frame_stride: float = 0.01, 
                          n_segments: int = 10) -> np.ndarray:
    """
    Extracts 39 MFCC features framewise from an audio file and then applies
    average pooling to condense the features over time into an n x 39 feature matrix.
    
    Parameters:
      file_path (str): Path to the audio file.
      n_mfcc (int): Number of MFCC features to extract. Default is 39.
      frame_size (float): Length of each frame in seconds. Default is 0.025.
      frame_stride (float): Step between successive frames in seconds. Default is 0.01.
      n_segments (int): Number of segments (n) to pool the frames into.
    
    Returns:
      np.ndarray: A n x 39 array where each row is the average MFCC vector for that segment.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sample_rate)
        hop_length = int(frame_stride * sample_rate)
        
        # Extract MFCC features; result shape is (n_mfcc, T) where T is number of frames.
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc,
                                    n_fft=frame_length, hop_length=hop_length)
        
        # Normalize the MFCC features along each coefficient dimension.
        mfcc_normalized = mfcc - np.mean(mfcc, axis=1, keepdims=True)
        
        # Transpose to shape (T, n_mfcc) for pooling along the time axis.
        mfcc_normalized = mfcc_normalized.T
        
        # Divide the frames into n_segments segments and compute the average for each segment.
        segments = np.array_split(mfcc_normalized, n_segments, axis=0)
        pooled_features = np.array([np.mean(seg, axis=0) for seg in segments])
        
        return pooled_features  # Shape: (n_segments, 39)
    
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return np.array([])

def process_directory_mfcc(directory: str, n_segments: int = 10) -> Dict[str, np.ndarray]:
    """
    Processes all .wav files in the given directory, extracting their MFCC features
    using average pooling to produce an n x 39 feature matrix for each file.
    
    Parameters:
      directory (str): Path to the directory containing .wav files.
      n_segments (int): Number of segments to pool the frames into for each file.
    
    Returns:
      Dict[str, np.ndarray]: A dictionary mapping filenames to their corresponding feature matrices.
    """
    feature_vectors = {}
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            features = extract_mfcc_features(file_path, n_segments=n_segments)
            if features.size > 0:
                feature_vectors[filename] = features
    return feature_vectors


## Classifier on MFCC

In [5]:
import wandb
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


def load_labels(csv_file: str) -> pd.DataFrame:
    return pd.read_csv(csv_file)

def prepare_dataset(features: dict, labels: pd.DataFrame):
    """
    Constructs the dataset by matching each audio file's feature matrix with its label.
    Since each file is represented as an n x 39 matrix (n segments by 39 features),
    we flatten it into a 1D feature vector of length n*39.
    """
    X = []
    y = []
    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            # Flatten the (n, 39) matrix to a 1D vector (n*39,)
            feature_matrix = features[file_id]
            feature_vector = feature_matrix.flatten()
            X.append(feature_vector)
            y.append(int(row['EmotionNumeric']))
    return np.array(X), np.array(y)


def train_and_evaluate(X, y, n):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train GMM Classifier
    gmm_model = GaussianMixture(n_components=len(np.unique(y)), random_state=42)
    gmm_model.fit(X_train)
    gmm_predictions = gmm_model.predict(X_test)

    # Train SVM Classifier
    svm_model = SVC(kernel='linear', random_state=42)
    svm_model.fit(X_train, y_train)
    svm_predictions = svm_model.predict(X_test)

    # Evaluate classifiers
    gmm_accuracy = accuracy_score(y_test, gmm_predictions)
    gmm_precision = precision_score(y_test, gmm_predictions, average='weighted', zero_division=0)
    gmm_recall = recall_score(y_test, gmm_predictions, average='weighted', zero_division=0)
    gmm_f1 = f1_score(y_test, gmm_predictions, average='weighted', zero_division=0)

    svm_accuracy = accuracy_score(y_test, svm_predictions)
    svm_precision = precision_score(y_test, svm_predictions, average='weighted', zero_division=0)
    svm_recall = recall_score(y_test, svm_predictions, average='weighted', zero_division=0)
    svm_f1 = f1_score(y_test, svm_predictions, average='weighted', zero_division=0)

    print("GMM Classifier Report:")
    print(classification_report(y_test, gmm_predictions))

    print("SVM Classifier Report:")
    print(classification_report(y_test, svm_predictions))

    # Log metrics to wandb
    wandb.log({
        "n_segments": n,
        "GMM Accuracy": gmm_accuracy,
        "GMM Precision": gmm_precision,
        "GMM Recall": gmm_recall,
        "GMM F1 Score": gmm_f1,
        "SVM Accuracy": svm_accuracy,
        "SVM Precision": svm_precision,
        "SVM Recall": svm_recall,
        "SVM F1 Score": svm_f1
    })



In [7]:
import wandb

# Initialize wandb
wandb.init(project="emotion-recognition", name="ssp-mfcc-classification")

# Example usage: Varying n
n_values = np.arange(5, 100, 5)  # n_segments from 5 to 20 in steps of 5
labels_csv_path = "EmoDB_dataset/emotion_mapping_detailed.csv"
labels = load_labels(labels_csv_path)

for n in n_values:
    print(f"\nRunning for n_segments = {n}")
    
    # Extract MFCC features with the current n
    mfccFeatures = process_directory_mfcc(directory, n)
    print(f"Number of files processed: {len(mfccFeatures)}")
    
    # Prepare the dataset: each feature matrix is flattened to become a vector
    X, y = prepare_dataset(mfccFeatures, labels)
    print("Dataset shape:", X.shape)
    
    # Train and evaluate classifiers, logging metrics to wandb
    train_and_evaluate(X, y, n)

# Finish wandb run
wandb.finish()


Running for n_segments = 5
Number of files processed: 535
Dataset shape: (535, 195)
GMM Classifier Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.21      0.90      0.34        20
           2       0.00      0.00      0.00        12
           3       0.20      0.07      0.11        14
           4       0.20      0.17      0.18        18
           5       0.00      0.00      0.00         9
           6       0.00      0.00      0.00        16

    accuracy                           0.21       107
   macro avg       0.09      0.16      0.09       107
weighted avg       0.10      0.21      0.11       107

SVM Classifier Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.45      0.83      0.59        18
           1       0.71      0.75      0.73        20
           2       0.00      0.00      0.00        12
           3       0.62      0.36      0.45        14
           4       0.71      0.56      0.62        18
           5       0.50      0.44      0.47         9
           6       0.44      0.44      0.44        16

    accuracy                           0.52       107
   macro avg       0.49      0.48      0.47       107
weighted avg       0.52      0.52      0.51       107


Running for n_segments = 10
Number of files processed: 535
Dataset shape: (535, 390)
GMM Classifier Report:
              precision    recall  f1-score   support

           0       0.50      0.06      0.10        18
           1       0.11      0.10      0.10        20
           2       0.25      0.08      0.12        12
           3       0.16      0.64      0.25        14
           4       0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Number of files processed: 535
Dataset shape: (535, 2730)
GMM Classifier Report:
              precision    recall  f1-score   support

           0       0.15      0.22      0.18        18
           1       0.00      0.00      0.00        20
           2       0.12      0.33      0.18        12
           3       0.00      0.00      0.00        14
           4       0.12      0.06      0.08        18
           5       0.00      0.00      0.00         9
           6       0.13      0.19      0.15        16

    accuracy                           0.11       107
   macro avg       0.07      0.11      0.08       107
weighted avg       0.08      0.11      0.09       107

SVM Classifier Report:
              precision    recall  f1-score   support

           0       0.46      0.67      0.55        18
           1       0.57      0.65      0.60        20
           2       0.50      0.08      0.14        12
           3       0.29      0.29      0.29        14
           4       0.50     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Number of files processed: 535
Dataset shape: (535, 3120)
GMM Classifier Report:
              precision    recall  f1-score   support

           0       0.08      0.06      0.07        18
           1       0.36      0.25      0.29        20
           2       0.25      0.17      0.20        12
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        18
           5       0.18      0.44      0.26         9
           6       0.21      0.56      0.31        16

    accuracy                           0.20       107
   macro avg       0.15      0.21      0.16       107
weighted avg       0.16      0.20      0.16       107

SVM Classifier Report:
              precision    recall  f1-score   support

           0       0.46      0.67      0.55        18
           1       0.65      0.55      0.59        20
           2       0.00      0.00      0.00        12
           3       0.40      0.29      0.33        14
           4       0.43     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


GMM Classifier Report:
              precision    recall  f1-score   support

           0       0.17      0.17      0.17        18
           1       0.39      0.35      0.37        20
           2       0.18      0.17      0.17        12
           3       0.15      0.29      0.20        14
           4       0.00      0.00      0.00        18
           5       0.00      0.00      0.00         9
           6       0.00      0.00      0.00        16

    accuracy                           0.15       107
   macro avg       0.13      0.14      0.13       107
weighted avg       0.14      0.15      0.14       107

SVM Classifier Report:
              precision    recall  f1-score   support

           0       0.46      0.61      0.52        18
           1       0.60      0.60      0.60        20
           2       0.40      0.17      0.24        12
           3       0.31      0.36      0.33        14
           4       0.43      0.33      0.38        18
           5       0.54      0.7

0,1
GMM Accuracy,█▇██▆▄▄▅▅▁▅▆▄▄▅█▃▁▆
GMM F1 Score,▄▆██▅▄▃▄▄▁▄▆▃▃▂▇▃▂▆
GMM Precision,▂▆▄▆▄▃▂▃█▁▃▃▂▂▂▃▂▅▃
GMM Recall,█▇██▆▄▄▅▅▁▅▆▄▄▅█▃▁▆
SVM Accuracy,▆█▆▃▄▆▄▃▅▁▁▅▂▄▄▅█▃▃
SVM F1 Score,▆█▆▄▄▆▄▃▅▁▁▄▂▄▂▄▇▃▃
SVM Precision,▆█▇▄▃█▄▄▅▁▁▄▂▅▃▃█▃▃
SVM Recall,▆█▆▃▄▆▄▃▅▁▁▅▂▄▄▅█▃▃
n_segments,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██

0,1
GMM Accuracy,0.14953
GMM F1 Score,0.14257
GMM Precision,0.14125
GMM Recall,0.14953
SVM Accuracy,0.4486
SVM F1 Score,0.43512
SVM Precision,0.44223
SVM Recall,0.4486
n_segments,95.0


# RCC and LP Residual

In [8]:
import os
import numpy as np
import pandas as pd
from scipy.signal import lfilter
from scipy.fftpack import dct
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [9]:
import numpy as np
from scipy.fftpack import dct
from scipy.signal import lfilter

def extract_rcc(frame: np.ndarray, order: int = 12, n_rcc: int = 12) -> np.ndarray:
    """
    Extract Residual Cepstral Coefficients (RCC) from a signal frame using LPC and residual signal.
    """
    try:
        # Step 1: LPC Analysis - Calculate the LPC coefficients (Prediction Coefficients)
        autocorr = np.correlate(frame, frame, mode='full')
        autocorr = autocorr[len(autocorr)//2:]  # Keep second half (autocorrelation)
        
        if autocorr[0] == 0:
            return np.zeros(n_rcc)  # Silent frame, return zero vector
        
        # Levinson-Durbin recursion to solve for LPC coefficients
        a = np.zeros(order + 1)
        e = autocorr[0]
        k = np.zeros(order)

        for i in range(order):
            acc = autocorr[i + 1] - np.dot(a[1:i + 1], autocorr[i:0:-1])
            ki = acc / e
            k[i] = ki
            a[1:i+1] -= ki * a[i:0:-1]
            a[i + 1] = ki
            e *= (1 - ki ** 2)

        # Step 2: Compute the residual signal by filtering the frame using LPC coefficients
        residual = lfilter(a, [1.0], frame)
        
        # Step 3: Apply Cepstral Analysis (DCT) to the residual signal
        # We use the first n_rcc coefficients from the DCT of the log of the residual power spectrum
        residual_power_spectrum = np.abs(np.fft.fft(residual)) ** 2
        log_residual_spectrum = np.log(residual_power_spectrum + 1e-8)  # Log power spectrum

        # Compute the DCT (Discrete Cosine Transform)
        rcc = dct(log_residual_spectrum, type=2)[:n_rcc]
        
        return rcc
    
    except Exception as e:
        print(f"Error extracting RCC: {e}")
        return np.zeros(n_rcc)  # Return zero vector in case of error


In [10]:
import numpy as np
import librosa

def compute_lp_residual_energy(frame: np.ndarray, order: int = 12) -> float:
    """
    Compute the Linear Prediction (LP) residual energy of a signal frame using librosa.

    Parameters:
    frame (np.ndarray): The input frame of the signal.
    order (int): The order of the LPC analysis (default is 12).

    Returns:
    float: The energy of the LP residual signal.
    """
    try:
        # Step 1: LPC Analysis using librosa to compute LPC coefficients
        a = librosa.lpc(frame, order=order)  # LPC coefficients (a[0] is the gain)

        # Step 2: Compute the residual signal by filtering the frame using LPC coefficients
        residual = lfilter(a, [1.0], frame)

        # Step 3: Compute the energy of the residual signal (sum of squared values)
        residual_energy = np.sum(residual ** 2)

        return residual_energy
    
    except Exception as e:
        print(f"Error computing LP residual energy: {e}")
        return 0.0  # Return zero in case of error


In [11]:
import numpy as np
import librosa
from scipy.signal import lfilter
from scipy.fftpack import dct

def extract_rcc_lp_features(file_path: str,
                            frame_size: float = 0.025,
                            frame_stride: float = 0.01,
                            target_rcc_segments: int = 100,
                            target_lp_segments: int = 100,
                            rcc_order: int = 12) -> tuple[np.ndarray, np.ndarray]:
    """
    Extracts RCC and LP residual energy framewise then condenses the features 
    into fixed-length feature matrices via average pooling.

    Parameters:
        file_path (str): Path to the audio file.
        frame_size (float): Frame duration in seconds.
        frame_stride (float): Step between successive frames in seconds.
        target_rcc_segments (int): Desired number of pooled segments for RCC features.
        target_lp_segments (int): Desired number of pooled segments for LP residual features.
        rcc_order (int): Number of RCC coefficients to extract per frame.
    
    Returns:
        tuple:
            np.ndarray: Pooled RCC features with shape (target_rcc_segments, rcc_order).
            np.ndarray: Pooled LP residual energies with shape (target_lp_segments, 1).
    """
    try:
        signal, sr = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sr)
        hop_length = int(frame_stride * sr)
        
        # Frame the signal: shape (number_of_frames, frame_length)
        frames = librosa.util.frame(signal, frame_length=frame_length, hop_length=hop_length).T

        rcc_list = []
        lp_list = []
        
        for frame in frames:
            # Apply windowing
            frame = frame * np.hamming(len(frame))
            
            # RCC extraction: use LPC analysis then DCT of the log power spectrum
            # (using the existing extract_rcc logic)
            try:
                # Compute autocorrelation for LPC
                autocorr = np.correlate(frame, frame, mode='full')
                autocorr = autocorr[len(autocorr)//2:]
                if autocorr[0] == 0:
                    rcc = np.zeros(rcc_order)
                else:
                    a = np.zeros(rcc_order + 1)
                    e = autocorr[0]
                    for i in range(rcc_order):
                        acc = autocorr[i + 1] - np.dot(a[1:i+1], autocorr[i:0:-1])
                        ki = acc / e
                        a[1:i+1] -= ki * a[i:0:-1]
                        a[i + 1] = ki
                        e *= (1 - ki ** 2)
                    # Residual signal
                    residual = lfilter(a, [1.0], frame)
                    # Compute RCC using DCT
                    residual_power_spectrum = np.abs(np.fft.fft(residual)) ** 2
                    log_residual_spectrum = np.log(residual_power_spectrum + 1e-8)
                    rcc = dct(log_residual_spectrum, type=2)[:rcc_order]
            except Exception as ex:
                print(f"Error in RCC extraction for frame: {ex}")
                rcc = np.zeros(rcc_order)
            
            # LP residual energy extraction: using librosa.lpc for LPC coefficients
            try:
                a_lp = librosa.lpc(frame, order=rcc_order) 
                residual_lp = lfilter(a_lp, [1.0], frame)
                lp_energy = np.sum(residual_lp ** 2)
            except Exception as ex:
                print(f"Error in LP energy computation for frame: {ex}")
                lp_energy = 0.0
            
            rcc_list.append(rcc)
            lp_list.append([lp_energy])  # Keep as list for 2D array

        rcc_array = np.array(rcc_list)  # shape: (num_frames, rcc_order)
        lp_array = np.array(lp_list)      # shape: (num_frames, 1)
        
        # Average pool features to desired length for each feature type
        # For RCC features
        rcc_segments = np.array_split(rcc_array, target_rcc_segments, axis=0)
        rcc_pooled = np.array([np.mean(seg, axis=0) for seg in rcc_segments])
        
        # For LP residual energy features
        lp_segments = np.array_split(lp_array, target_lp_segments, axis=0)
        lp_pooled = np.array([np.mean(seg, axis=0) for seg in lp_segments])
        
        return rcc_pooled, lp_pooled
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return np.zeros((target_rcc_segments, rcc_order)), np.zeros((target_lp_segments, 1))

In [12]:
def process_directory_rcc_lp(directory: str,
                             frame_size: float = 0.025,
                             frame_stride: float = 0.01,
                             target_rcc_segments: int = 100,
                             target_lp_segments: int = 100,
                             rcc_order: int = 12) -> dict:
    """
    Processes all .wav files in the directory and extracts RCC and LP residual features.

    Parameters:
        directory (str): Path to the directory containing .wav files.
        frame_size (float): Frame duration in seconds.
        frame_stride (float): Step between successive frames in seconds.
        target_rcc_segments (int): Desired number of pooled segments for RCC features.
        target_lp_segments (int): Desired number of pooled segments for LP residual features.
        rcc_order (int): Number of RCC coefficients to extract per frame.

    Returns:
        dict: Mapping of filename to {"RCC": <flattened RCC vector>,
                                      "LP": <flattened LP vector>,
                                      "Full": <concatenated RCC and LP features>}.
    """
    import os
    import numpy as np

    feature_vectors = {}

    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            rcc, lp = extract_rcc_lp_features(
                file_path,
                frame_size=frame_size,
                frame_stride=frame_stride,
                target_rcc_segments=target_rcc_segments,
                target_lp_segments=target_lp_segments,
                rcc_order=rcc_order
            )
            rcc_flat = rcc.flatten()  # Shape: (target_rcc_segments * rcc_order,)
            lp_flat = lp.flatten()    # Shape: (target_lp_segments,)

            feature_vectors[filename] = {
                "RCC": rcc_flat,
                "LP": lp_flat,
                "Full": np.concatenate([rcc_flat, lp_flat])
            }

    return feature_vectors

In [13]:
def prepare_dataset(features: dict, labels: pd.DataFrame):
    """
    Prepares RCC, LP, and combined feature sets for classification.

    Returns:
        X_full (np.ndarray): RCC + LP.
        X_rcc (np.ndarray): RCC only.
        X_lp (np.ndarray): LP only.
        y (np.ndarray): Emotion labels.
    """
    X_full, X_rcc, X_lp, y = [], [], [], []

    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            feat = features[file_id]
            X_full.append(feat["Full"])
            X_rcc.append(feat["RCC"])
            X_lp.append(feat["LP"])
            y.append(int(row['EmotionNumeric']))

    return (
        np.array(X_full),
        np.array(X_rcc),
        np.array(X_lp),
        np.array(y)
    )

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

def train_and_evaluate(X, y):
    """
    Train and evaluate multiple classifiers: SVM, Random Forest, XGBoost, Logistic Regression, KNN.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    classifiers = {
        "SVM (Linear Kernel)": SVC(kernel='linear', random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=20000, random_state=42),
        "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
    }

    for name, clf in classifiers.items():
        print(f"\nTraining and evaluating: {name}")
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(classification_report(y_test, y_pred, zero_division=0))


In [15]:
# Paths
directory = "./EmoDB_dataset/wav"  # Replace with your path
labels_csv_path = "./EmoDB_dataset/emotion_mapping_detailed.csv"  # Replace with your CSV path

# Load label CSV
labels = pd.read_csv(labels_csv_path)

# Extract RCC and LP features
features = process_directory_rcc_lp(directory, target_rcc_segments = 10, target_lp_segments = 120)

# Prepare feature sets
X_full, X_rcc, X_lp, y = prepare_dataset(features, labels)

# Check dataset shapes
print("Full (RCC + LP) shape:", X_full.shape)
print("RCC-only shape:", X_rcc.shape)
print("LP-only shape:", X_lp.shape)


Full (RCC + LP) shape: (535, 240)
RCC-only shape: (535, 120)
LP-only shape: (535, 120)


In [16]:
print("\n--- RCC + LP Residual ---")
train_and_evaluate(X_full, y)


--- RCC + LP Residual ---

Training and evaluating: SVM (Linear Kernel)
              precision    recall  f1-score   support

           0       0.63      0.67      0.65        18
           1       0.67      0.50      0.57        20
           2       0.43      0.25      0.32        12
           3       0.47      0.64      0.55        14
           4       0.59      0.56      0.57        18
           5       0.56      0.56      0.56         9
           6       0.57      0.75      0.65        16

    accuracy                           0.57       107
   macro avg       0.56      0.56      0.55       107
weighted avg       0.57      0.57      0.56       107


Training and evaluating: Random Forest
              precision    recall  f1-score   support

           0       0.55      0.89      0.68        18
           1       0.80      0.60      0.69        20
           2       0.00      0.00      0.00        12
           3       0.50      0.43      0.46        14
           4       

In [17]:
print("\n--- RCC Only ---")
train_and_evaluate(X_rcc, y)


--- RCC Only ---

Training and evaluating: SVM (Linear Kernel)
              precision    recall  f1-score   support

           0       0.63      0.67      0.65        18
           1       0.67      0.50      0.57        20
           2       0.43      0.25      0.32        12
           3       0.47      0.64      0.55        14
           4       0.59      0.56      0.57        18
           5       0.56      0.56      0.56         9
           6       0.57      0.75      0.65        16

    accuracy                           0.57       107
   macro avg       0.56      0.56      0.55       107
weighted avg       0.57      0.57      0.56       107


Training and evaluating: Random Forest
              precision    recall  f1-score   support

           0       0.50      0.89      0.64        18
           1       0.59      0.65      0.62        20
           2       0.50      0.08      0.14        12
           3       0.45      0.36      0.40        14
           4       0.75     

In [18]:

print("\n--- LP Residual Only ---")
train_and_evaluate(X_lp, y)


--- LP Residual Only ---

Training and evaluating: SVM (Linear Kernel)
              precision    recall  f1-score   support

           0       0.29      0.61      0.39        18
           1       0.50      0.15      0.23        20
           2       0.14      0.08      0.11        12
           3       0.20      0.07      0.11        14
           4       0.75      0.17      0.27        18
           5       0.29      0.22      0.25         9
           6       0.30      0.75      0.43        16

    accuracy                           0.31       107
   macro avg       0.35      0.29      0.26       107
weighted avg       0.38      0.31      0.27       107


Training and evaluating: Random Forest
              precision    recall  f1-score   support

           0       0.48      0.89      0.63        18
           1       0.69      0.55      0.61        20
           2       0.00      0.00      0.00        12
           3       0.50      0.71      0.59        14
           4       0

In [22]:
import os
import numpy as np
import pandas as pd
import librosa
import scipy.signal
from typing import Dict

def extract_gvv_features(file_path: str, frame_size: float = 0.025, frame_stride: float = 0.01, n_segments: int = 10) -> np.ndarray:
    """
    Extracts GVV features from an audio file using a simplified IAIF approach.
    The features are averaged over time into an n x 1 feature matrix.

    Returns:
      np.ndarray: A (n_segments, 1) array of GVV energy values.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sample_rate)
        hop_length = int(frame_stride * sample_rate)

        # Pre-emphasis
        pre_emphasis = 0.97
        emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

        # LPC Analysis
        lpc_order = 16
        lpc_coeffs = librosa.lpc(emphasized_signal, order=lpc_order)

        # Inverse filtering (glottal excitation)
        glottal_source = scipy.signal.lfilter(lpc_coeffs, [1.0], emphasized_signal)

        # Frame the glottal source signal
        frames = librosa.util.frame(glottal_source, frame_length=frame_length, hop_length=hop_length).T

        # Energy per frame
        frame_energies = np.sum(frames ** 2, axis=1)

        # Normalize energies
        frame_energies -= np.mean(frame_energies)
        frame_energies /= (np.std(frame_energies) + 1e-6)

        # Segment pooling
        segments = np.array_split(frame_energies, n_segments)
        pooled_features = np.array([np.mean(seg) for seg in segments]).reshape(-1, 1)

        return pooled_features

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return np.array([])

In [23]:
def process_directory_gvv(directory: str, n_segments: int = 10) -> Dict[str, np.ndarray]:
    """
    Extracts GVV features from all .wav files in the directory.
    """
    feature_vectors = {}
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            features = extract_gvv_features(file_path, n_segments=n_segments)
            if features.size > 0:
                feature_vectors[filename] = features.flatten()  # Flatten for classifier input
    return feature_vectors

In [28]:
# Paths
directory = "./EmoDB_dataset/wav"  # Replace with your .wav directory
labels_csv_path = "./EmoDB_dataset/emotion_mapping_detailed.csv"  # Replace with your CSV

# Load emotion labels
labels_df = pd.read_csv(labels_csv_path)  # Assumes columns: 'filename', 'emotion'

# Extract GVV features
features_dict = process_directory_gvv(directory)

# Align features with labels
X, y = [], []

for _, row in labels_df.iterrows():
    fname = row['Filename']
    if fname in features_dict:
        X.append(features_dict[fname])
        y.append(row['EmotionNumeric'])  # Adjust if label is in another column

X = np.array(X)
y = np.array(y)

In [None]:
def train_and_evaluate(X, y):
    """
    Train and evaluate multiple classifiers: SVM, Random Forest, XGBoost, Logistic Regression, KNN.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    classifiers = {
        "SVM (Linear Kernel)": SVC(kernel='linear', random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=20000, random_state=42),
        "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=4)
    }

    for name, clf in classifiers.items():
        print(f"\nTraining and evaluating: {name}")
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(classification_report(y_test, y_pred, zero_division=0))

In [33]:
print("\n--- GVV ---")
train_and_evaluate(X, y)


--- GVV ---

Training and evaluating: SVM (Linear Kernel)
              precision    recall  f1-score   support

           0       0.27      0.89      0.41        18
           1       0.38      0.45      0.41        20
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        18
           5       0.62      0.56      0.59         9
           6       0.29      0.25      0.27        16

    accuracy                           0.32       107
   macro avg       0.22      0.31      0.24       107
weighted avg       0.21      0.32      0.23       107


Training and evaluating: Random Forest
              precision    recall  f1-score   support

           0       0.38      0.89      0.53        18
           1       0.50      0.35      0.41        20
           2       0.00      0.00      0.00        12
           3       0.18      0.14      0.16        14
           4       0.33      0.17

In [38]:
def train_and_evaluate(X, y):
    """
    Train and evaluate multiple classifiers: SVM, Random Forest, XGBoost, Logistic Regression, KNN.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    classifiers = {
        "SVM (Linear Kernel)": SVC(kernel='linear', random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "Logistic Regression": LogisticRegression(max_iter=25000, random_state=1331),
        "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
    }

    for name, clf in classifiers.items():
        print(f"\nTraining and evaluating: {name}")
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(classification_report(y_test, y_pred, zero_division=0))

In [39]:
import os
import numpy as np
import pandas as pd

# Load features from each method.
mfcc_features = process_directory_mfcc(directory, n_segments=85)
rcc_lp_features = process_directory_rcc_lp(directory, target_rcc_segments=10, target_lp_segments=120, rcc_order=12)
gvv_features = process_directory_gvv(directory, n_segments=10)

# Combine features by appending (concatenation)
combined_features = {}
for filename in os.listdir(directory):
    if filename.endswith('.wav'):
        # Check that all features exist for this file
        if filename in mfcc_features and filename in rcc_lp_features and filename in gvv_features:
            mfcc_vec = mfcc_features[filename].flatten()         # e.g., shape (85*39,)
            rcc_lp_vec = rcc_lp_features[filename]["Full"]         # already flattened concatenation of RCC and LP
            gvv_vec = gvv_features[filename].flatten()             # e.g., shape (10,)
            combined = np.concatenate([mfcc_vec, rcc_lp_vec, gvv_vec])
            combined_features[filename] = combined

# Prepare dataset using the combined feature vectors.
def prepare_combined_dataset(features: dict, labels: pd.DataFrame):
    X, y = [], []
    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            X.append(features[file_id])
            y.append(int(row['EmotionNumeric']))
    return np.array(X), np.array(y)

labels = pd.read_csv("EmoDB_dataset/emotion_mapping_detailed.csv")
X_combined, y_combined = prepare_combined_dataset(combined_features, labels)
print("Combined dataset shape:", X_combined.shape)

# Train and evaluate classifier on the combined feature set
train_and_evaluate(X_combined, y_combined)

Combined dataset shape: (535, 3565)

Training and evaluating: SVM (Linear Kernel)
              precision    recall  f1-score   support

           0       0.74      0.78      0.76        18
           1       0.72      0.65      0.68        20
           2       0.57      0.33      0.42        12
           3       0.53      0.64      0.58        14
           4       0.69      0.61      0.65        18
           5       0.55      0.67      0.60         9
           6       0.63      0.75      0.69        16

    accuracy                           0.64       107
   macro avg       0.63      0.63      0.63       107
weighted avg       0.65      0.64      0.64       107


Training and evaluating: Random Forest
              precision    recall  f1-score   support

           0       0.42      1.00      0.59        18
           1       0.47      0.45      0.46        20
           2       0.00      0.00      0.00        12
           3       0.56      0.36      0.43        14
          