In [None]:
directory = './EmoDB_dataset/wav'
labels_csv_path = "./EmoDB_dataset/emotion_mapping_detailed.csv"  # Replace with your CSV path

# Train and evaluate function

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import numpy as np
import os

def train_and_evaluate(X, y):
    """
    Train and evaluate multiple classifiers and return metrics dictionary.
    
    Parameters:
        X (np.ndarray): Feature matrix
        y (np.ndarray): Target labels
        
    Returns:
        dict: Dictionary containing metrics for each classifier
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    classifiers = {
        "SVM": SVC(kernel='linear', random_state=42),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
        "LogisticRegression": LogisticRegression(max_iter=20000, random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=5)
    }
    
    metrics = {}
    
    for name, clf in classifiers.items():
        print(f"\nTraining and evaluating: {name}")
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        # Store metrics in dictionary
        metrics[f"{name}_Accuracy"] = accuracy
        metrics[f"{name}_Precision"] = precision
        metrics[f"{name}_Recall"] = recall
        metrics[f"{name}_F1"] = f1
        
        # Print classification report for reference
        print(classification_report(y_test, y_pred, zero_division=0))
        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    
    return metrics

# MFCC features

## Feature extraction

In [18]:
import os
import librosa
import numpy as np
from typing import Dict

def extract_mfcc_features(file_path: str, n_mfcc: int = 39, 
                          frame_size: float = 0.025, frame_stride: float = 0.01, 
                          n_segments: int = 10) -> np.ndarray:
    """
    Extracts 39 MFCC features framewise from an audio file and then applies
    average pooling to condense the features over time into an n x 39 feature matrix.
    
    Parameters:
      file_path (str): Path to the audio file.
      n_mfcc (int): Number of MFCC features to extract. Default is 39.
      frame_size (float): Length of each frame in seconds. Default is 0.025.
      frame_stride (float): Step between successive frames in seconds. Default is 0.01.
      n_segments (int): Number of segments (n) to pool the frames into.
    
    Returns:
      np.ndarray: A n x 39 array where each row is the average MFCC vector for that segment.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sample_rate)
        hop_length = int(frame_stride * sample_rate)
        
        # Extract MFCC features; result shape is (n_mfcc, T) where T is number of frames.
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc,
                                    n_fft=frame_length, hop_length=hop_length)
        
        # Normalize the MFCC features along each coefficient dimension.
        mfcc_normalized = mfcc - np.mean(mfcc, axis=1, keepdims=True)
        
        # Transpose to shape (T, n_mfcc) for pooling along the time axis.
        mfcc_normalized = mfcc_normalized.T
        
        # Divide the frames into n_segments segments and compute the average for each segment.
        segments = np.array_split(mfcc_normalized, n_segments, axis=0)
        pooled_features = np.array([np.mean(seg, axis=0) for seg in segments])
        
        return pooled_features  # Shape: (n_segments, 39)
    
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return np.array([])

def process_directory_mfcc(directory: str, n_segments: int = 10) -> Dict[str, np.ndarray]:
    """
    Processes all .wav files in the given directory, extracting their MFCC features
    using average pooling to produce an n x 39 feature matrix for each file.
    
    Parameters:
      directory (str): Path to the directory containing .wav files.
      n_segments (int): Number of segments to pool the frames into for each file.
    
    Returns:
      Dict[str, np.ndarray]: A dictionary mapping filenames to their corresponding feature matrices.
    """
    feature_vectors = {}
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            features = extract_mfcc_features(file_path, n_segments=n_segments)
            if features.size > 0:
                feature_vectors[filename] = features
    return feature_vectors


## Classifier on MFCC

In [19]:
import wandb
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


def load_labels(csv_file: str) -> pd.DataFrame:
    return pd.read_csv(csv_file)

def prepare_dataset_mfcc(features: dict, labels: pd.DataFrame):
    """
    Constructs the dataset by matching each audio file's feature matrix with its label.
    Since each file is represented as an n x 39 matrix (n segments by 39 features),
    we flatten it into a 1D feature vector of length n*39.
    """
    X = []
    y = []
    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            # Flatten the (n, 39) matrix to a 1D vector (n*39,)
            feature_matrix = features[file_id]
            feature_vector = feature_matrix.flatten()
            X.append(feature_vector)
            y.append(int(row['EmotionNumeric']))
    return np.array(X), np.array(y)


In [None]:
import wandb

# Initialize wandb
wandb.init(project="emotion-recognition", name="ssp-mfcc-classification")

# Example usage: Varying n
n_values = np.arange(5, 110, 5)  # n_segments from 5 to 100 in steps of 5
labels = load_labels(labels_csv_path)

for n in n_values:
    print(f"\nRunning for n_segments = {n}")
    
    # Extract MFCC features with the current n
    mfccFeatures = process_directory_mfcc(directory, n)
    print(f"Number of files processed: {len(mfccFeatures)}")
    
    # Prepare the dataset: each feature matrix is flattened to become a vector
    X, y = prepare_dataset_mfcc(mfccFeatures, labels)
    print("Dataset shape:", X.shape)
    
    # Train and evaluate classifiers, logging metrics to wandb
    metrics = train_and_evaluate(X, y)
    
    # Log metrics to wandb with the 'n' value
    log_data = {
        "n_segments": n,
        "feature_type": "MFCC",
        "feature_dim": X.shape[1],
        **metrics
    }
    
    wandb.log(log_data)
    
    # Optional: You could also create a summary table for easy comparison
    wandb.run.summary[f"MFCC_n{n}_best_accuracy"] = max([
        metrics.get("SVM_Accuracy", 0),
        metrics.get("RandomForest_Accuracy", 0),
        metrics.get("XGBoost_Accuracy", 0),
        metrics.get("LogisticRegression_Accuracy", 0),
        metrics.get("KNN_Accuracy", 0)
    ])

# Finish wandb run
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
KNN_Accuracy,▁
KNN_F1,▁
KNN_Precision,▁
KNN_Recall,▁
LogisticRegression_Accuracy,▁
LogisticRegression_F1,▁
LogisticRegression_Precision,▁
LogisticRegression_Recall,▁
RandomForest_Accuracy,▁
RandomForest_F1,▁

0,1
KNN_Accuracy,0.43925
KNN_F1,0.41414
KNN_Precision,0.43893
KNN_Recall,0.43925
LogisticRegression_Accuracy,0.53271
LogisticRegression_F1,0.53488
LogisticRegression_Precision,0.5485
LogisticRegression_Recall,0.53271
MFCC_n5_best_accuracy,0.53271
RandomForest_Accuracy,0.43925



Running for n_segments = 5
Number of files processed: 535
Dataset shape: (535, 195)

Training and evaluating: SVM
              precision    recall  f1-score   support

           0       0.45      0.83      0.59        18
           1       0.71      0.75      0.73        20
           2       0.00      0.00      0.00        12
           3       0.62      0.36      0.45        14
           4       0.71      0.56      0.62        18
           5       0.50      0.44      0.47         9
           6       0.44      0.44      0.44        16

    accuracy                           0.52       107
   macro avg       0.49      0.48      0.47       107
weighted avg       0.52      0.52      0.51       107

Accuracy: 0.5234, Precision: 0.5194, Recall: 0.5234, F1: 0.5053

Training and evaluating: RandomForest
              precision    recall  f1-score   support

           0       0.29      0.78      0.42        18
           1       0.61      0.70      0.65        20
           2       0.0

0,1
KNN_Accuracy,█▁▇
KNN_F1,█▁▆
KNN_Precision,█▁█
KNN_Recall,█▁▇
LogisticRegression_Accuracy,█▇▁
LogisticRegression_F1,█▇▁
LogisticRegression_Precision,█▇▁
LogisticRegression_Recall,█▇▁
RandomForest_Accuracy,▁█▄
RandomForest_F1,▁█▇

0,1
KNN_Accuracy,0.42991
KNN_F1,0.39549
KNN_Precision,0.44483
KNN_Recall,0.42991
LogisticRegression_Accuracy,0.4486
LogisticRegression_F1,0.45049
LogisticRegression_Precision,0.48552
LogisticRegression_Recall,0.4486
MFCC_n10_best_accuracy,0.56075
MFCC_n15_best_accuracy,0.51402


# RCC

In [22]:
import os
import numpy as np
import pandas as pd
from scipy.signal import lfilter
from scipy.fftpack import dct
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [23]:
import numpy as np
from scipy.fftpack import dct
from scipy.signal import lfilter

def extract_rcc(frame: np.ndarray, order: int = 12, n_rcc: int = 12) -> np.ndarray:
    """
    Extract Residual Cepstral Coefficients (RCC) from a signal frame using LPC and residual signal.
    """
    try:
        # Step 1: LPC Analysis - Calculate the LPC coefficients (Prediction Coefficients)
        autocorr = np.correlate(frame, frame, mode='full')
        autocorr = autocorr[len(autocorr)//2:]  # Keep second half (autocorrelation)
        
        if autocorr[0] == 0:
            return np.zeros(n_rcc)  # Silent frame, return zero vector
        
        # Levinson-Durbin recursion to solve for LPC coefficients
        a = np.zeros(order + 1)
        e = autocorr[0]
        k = np.zeros(order)

        for i in range(order):
            acc = autocorr[i + 1] - np.dot(a[1:i + 1], autocorr[i:0:-1])
            ki = acc / e
            k[i] = ki
            a[1:i+1] -= ki * a[i:0:-1]
            a[i + 1] = ki
            e *= (1 - ki ** 2)

        # Step 2: Compute the residual signal by filtering the frame using LPC coefficients
        residual = lfilter(a, [1.0], frame)
        
        # Step 3: Apply Cepstral Analysis (DCT) to the residual signal
        # We use the first n_rcc coefficients from the DCT of the log of the residual power spectrum
        residual_power_spectrum = np.abs(np.fft.fft(residual)) ** 2
        log_residual_spectrum = np.log(residual_power_spectrum + 1e-8)  # Log power spectrum

        # Compute the DCT (Discrete Cosine Transform)
        rcc = dct(log_residual_spectrum, type=2)[:n_rcc]
        
        return rcc
    
    except Exception as e:
        print(f"Error extracting RCC: {e}")
        return np.zeros(n_rcc)  # Return zero vector in case of error


In [24]:
import numpy as np
import librosa
from scipy.signal import lfilter
from scipy.fftpack import dct

def extract_rcc_features(file_path: str,
                         frame_size: float = 0.025,
                         frame_stride: float = 0.01,
                         target_rcc_segments: int = 100,
                         rcc_order: int = 12) -> np.ndarray:
    """
    Extracts Residual Cepstral Coefficients (RCC) features framewise then condenses 
    the features into a fixed-length feature matrix via average pooling.

    Parameters:
        file_path (str): Path to the audio file.
        frame_size (float): Frame duration in seconds.
        frame_stride (float): Step between successive frames in seconds.
        target_rcc_segments (int): Desired number of pooled segments for RCC features.
        rcc_order (int): Number of RCC coefficients to extract per frame.
    
    Returns:
        np.ndarray: Pooled RCC features with shape (target_rcc_segments, rcc_order).
    """
    try:
        signal, sr = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sr)
        hop_length = int(frame_stride * sr)
        
        # Frame the signal: shape (number_of_frames, frame_length)
        frames = librosa.util.frame(signal, frame_length=frame_length, hop_length=hop_length).T

        rcc_list = []
        
        for frame in frames:
            # Apply windowing
            frame = frame * np.hamming(len(frame))
            
            # RCC extraction: use LPC analysis then DCT of the log power spectrum
            try:
                # Compute autocorrelation for LPC
                autocorr = np.correlate(frame, frame, mode='full')
                autocorr = autocorr[len(autocorr)//2:]
                if autocorr[0] == 0:
                    rcc = np.zeros(rcc_order)
                else:
                    a = np.zeros(rcc_order + 1)
                    e = autocorr[0]
                    for i in range(rcc_order):
                        acc = autocorr[i + 1] - np.dot(a[1:i+1], autocorr[i:0:-1])
                        ki = acc / e
                        a[1:i+1] -= ki * a[i:0:-1]
                        a[i + 1] = ki
                        e *= (1 - ki ** 2)
                    # Residual signal
                    residual = lfilter(a, [1.0], frame)
                    # Compute RCC using DCT
                    residual_power_spectrum = np.abs(np.fft.fft(residual)) ** 2
                    log_residual_spectrum = np.log(residual_power_spectrum + 1e-8)
                    rcc = dct(log_residual_spectrum, type=2)[:rcc_order]
            except Exception as ex:
                print(f"Error in RCC extraction for frame: {ex}")
                rcc = np.zeros(rcc_order)
            
            rcc_list.append(rcc)

        rcc_array = np.array(rcc_list)  # shape: (num_frames, rcc_order)
        
        # Average pool features to desired length
        rcc_segments = np.array_split(rcc_array, target_rcc_segments, axis=0)
        rcc_pooled = np.array([np.mean(seg, axis=0) for seg in rcc_segments])
        
        return rcc_pooled
    
    except Exception as e:
        print(f"Error processing {file_path} for RCC: {e}")
        return np.zeros((target_rcc_segments, rcc_order))

In [25]:
import os
import numpy as np
from typing import Dict

def process_directory_rcc(directory: str,
                          frame_size: float = 0.025,
                          frame_stride: float = 0.01,
                          target_rcc_segments: int = 100,
                          rcc_order: int = 12) -> Dict[str, np.ndarray]:
    """
    Processes all .wav files in the directory and extracts RCC features.

    Parameters:
        directory (str): Path to the directory containing .wav files.
        frame_size (float): Frame duration in seconds.
        frame_stride (float): Step between successive frames in seconds.
        target_rcc_segments (int): Desired number of pooled segments for RCC features.
        rcc_order (int): Number of RCC coefficients to extract per frame.

    Returns:
        dict: Mapping of filename to flattened RCC feature vectors.
    """
    feature_vectors = {}

    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            try:
                rcc_features = extract_rcc_features(
                    file_path,
                    frame_size=frame_size,
                    frame_stride=frame_stride,
                    target_rcc_segments=target_rcc_segments,
                    rcc_order=rcc_order
                )
                rcc_flat = rcc_features.flatten()  # Shape: (target_rcc_segments * rcc_order,)
                feature_vectors[filename] = rcc_flat
            except Exception as e:
                print(f"Error processing RCC for {file_path}: {e}")

    return feature_vectors


In [26]:
def prepare_rcc_dataset(features: dict, labels: pd.DataFrame):
    """
    Prepares RCC feature set for classification.

    Parameters:
        features (dict): Dictionary mapping filenames to RCC feature vectors.
        labels (pd.DataFrame): DataFrame containing emotion labels.

    Returns:
        tuple:
            np.ndarray: RCC features.
            np.ndarray: Emotion labels.
    """
    X_rcc, y = [], []

    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            # Direct access to feature vector (no nested dictionary)
            X_rcc.append(features[file_id])
            y.append(int(row['EmotionNumeric']))

    return np.array(X_rcc), np.array(y)

In [None]:
import wandb

# Initialize wandb
wandb.init(project="emotion-recognition", name="ssp-rcc-classification")

# Experiment with different segment values for RCC features
rcc_segment_values = np.arange(5, 16, 2)  # Try different segment values from 5 to 45
labels_csv_path = "./EmoDB_dataset/emotion_mapping_detailed.csv"
labels = pd.read_csv(labels_csv_path)

for n in rcc_segment_values:
    print(f"\nRunning for target_rcc_segments = {n}")
    
    # Extract RCC features with the current number of segments
    features = process_directory_rcc(directory, target_rcc_segments=n)
    print(f"Number of files processed: {len(features)}")
    
    # Prepare feature sets
    X_rcc, y_rcc = prepare_rcc_dataset(features, labels)
    print("RCC-only shape:", X_rcc.shape)
    
    # Train and evaluate classifiers, logging metrics to wandb
    print(f"\n--- RCC with {n} segments ---")
    metrics = train_and_evaluate(X_rcc, y_rcc)
    
    # Log metrics to wandb with current segment value
    log_data = {
        "rcc_segments": n,
        "feature_type": "RCC",
        "feature_dim": X_rcc.shape[1],
        **metrics
    }
    
    wandb.log(log_data)
    
    # Add summary for easy comparison
    wandb.run.summary[f"RCC_n{n}_best_accuracy"] = max([
        metrics.get("SVM_Accuracy", 0),
        metrics.get("RandomForest_Accuracy", 0),
        metrics.get("XGBoost_Accuracy", 0),
        metrics.get("LogisticRegression_Accuracy", 0),
        metrics.get("KNN_Accuracy", 0)
    ])

# Finish wandb run
wandb.finish()


Running for target_rcc_segments = 5
Number of files processed: 535
RCC-only shape: (535, 60)

--- RCC with 5 segments ---

Training and evaluating: SVM
              precision    recall  f1-score   support

           0       0.61      0.61      0.61        18
           1       0.56      0.50      0.53        20
           2       0.23      0.25      0.24        12
           3       0.47      0.50      0.48        14
           4       0.62      0.56      0.59        18
           5       0.75      0.67      0.71         9
           6       0.37      0.44      0.40        16

    accuracy                           0.50       107
   macro avg       0.52      0.50      0.51       107
weighted avg       0.52      0.50      0.51       107

Accuracy: 0.5047, Precision: 0.5169, Recall: 0.5047, F1: 0.5094

Training and evaluating: RandomForest
              precision    recall  f1-score   support

           0       0.52      0.83      0.64        18
           1       0.48      0.50     

STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.68      0.72      0.70        18
           1       0.50      0.45      0.47        20
           2       0.25      0.17      0.20        12
           3       0.50      0.57      0.53        14
           4       0.63      0.67      0.65        18
           5       0.80      0.89      0.84         9
           6       0.41      0.44      0.42        16

    accuracy                           0.55       107
   macro avg       0.54      0.56      0.55       107
weighted avg       0.54      0.55      0.54       107

Accuracy: 0.5514, Precision: 0.5371, Recall: 0.5514, F1: 0.5424

Training and evaluating: KNN
              precision    recall  f1-score   support

           0       0.38      0.83      0.53        18
           1       0.48      0.70      0.57        20
           2       0.00      0.00      0.00        12
           3       0.40      0.29      0.33        14
           4       0.29      0.11     

STOP: TOTAL NO. OF F,G EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.75      0.50      0.60        18
           1       0.56      0.45      0.50        20
           2       0.18      0.17      0.17        12
           3       0.32      0.43      0.36        14
           4       0.50      0.44      0.47        18
           5       0.64      0.78      0.70         9
           6       0.50      0.69      0.58        16

    accuracy                           0.49       107
   macro avg       0.49      0.49      0.48       107
weighted avg       0.51      0.49      0.49       107

Accuracy: 0.4860, Precision: 0.5054, Recall: 0.4860, F1: 0.4861

Training and evaluating: KNN
              precision    recall  f1-score   support

           0       0.42      0.78      0.55        18
           1       0.35      0.55      0.43        20
           2       0.40      0.17      0.24        12
           3       0.57      0.29      0.38        14
           4       0.33      0.11     

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


              precision    recall  f1-score   support

           0       0.58      0.61      0.59        18
           1       0.64      0.35      0.45        20
           2       0.12      0.08      0.10        12
           3       0.40      0.43      0.41        14
           4       0.50      0.56      0.53        18
           5       0.64      0.78      0.70         9
           6       0.39      0.56      0.46        16

    accuracy                           0.48       107
   macro avg       0.47      0.48      0.46       107
weighted avg       0.48      0.48      0.47       107

Accuracy: 0.4766, Precision: 0.4788, Recall: 0.4766, F1: 0.4662

Training and evaluating: KNN
              precision    recall  f1-score   support

           0       0.48      0.72      0.58        18
           1       0.38      0.50      0.43        20
           2       0.00      0.00      0.00        12
           3       0.38      0.21      0.27        14
           4       0.57      0.22     

0,1
KNN_Accuracy,▁▁▇▇▁█
KNN_F1,▁▄██▃█
KNN_Precision,▁▅█▇▅▅
KNN_Recall,▁▁▇▇▁█
LogisticRegression_Accuracy,█▅▁▂▂▄
LogisticRegression_F1,█▅▁▂▁▄
LogisticRegression_Precision,█▆▁▂▂▄
LogisticRegression_Recall,█▅▁▂▂▄
RandomForest_Accuracy,█▇▅▄▃▁
RandomForest_F1,█▆▄▃▃▁

0,1
KNN_Accuracy,0.4486
KNN_F1,0.40238
KNN_Precision,0.41041
KNN_Recall,0.4486
LogisticRegression_Accuracy,0.47664
LogisticRegression_F1,0.46623
LogisticRegression_Precision,0.47885
LogisticRegression_Recall,0.47664
RCC_n10_best_accuracy,0.58879
RCC_n15_best_accuracy,0.54206


# LP Residual

In [28]:
import numpy as np
import librosa

def compute_lp_residual_energy(frame: np.ndarray, order: int = 12) -> float:
    """
    Compute the Linear Prediction (LP) residual energy of a signal frame using librosa.

    Parameters:
    frame (np.ndarray): The input frame of the signal.
    order (int): The order of the LPC analysis (default is 12).

    Returns:
    float: The energy of the LP residual signal.
    """
    try:
        # Step 1: LPC Analysis using librosa to compute LPC coefficients
        a = librosa.lpc(frame, order=order)  # LPC coefficients (a[0] is the gain)

        # Step 2: Compute the residual signal by filtering the frame using LPC coefficients
        residual = lfilter(a, [1.0], frame)

        # Step 3: Compute the energy of the residual signal (sum of squared values)
        residual_energy = np.sum(residual ** 2)

        return residual_energy
    
    except Exception as e:
        print(f"Error computing LP residual energy: {e}")
        return 0.0  # Return zero in case of error


In [29]:
def extract_lp_features(file_path: str,
                        frame_size: float = 0.025,
                        frame_stride: float = 0.01,
                        target_lp_segments: int = 100,
                        lp_order: int = 12) -> np.ndarray:
    """
    Extracts Linear Prediction (LP) residual energy framewise then condenses 
    the features into a fixed-length feature matrix via average pooling.

    Parameters:
        file_path (str): Path to the audio file.
        frame_size (float): Frame duration in seconds.
        frame_stride (float): Step between successive frames in seconds.
        target_lp_segments (int): Desired number of pooled segments for LP residual features.
        lp_order (int): Order of the LP analysis.
    
    Returns:
        np.ndarray: Pooled LP residual energies with shape (target_lp_segments, 1).
    """
    try:
        signal, sr = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sr)
        hop_length = int(frame_stride * sr)
        
        # Frame the signal: shape (number_of_frames, frame_length)
        frames = librosa.util.frame(signal, frame_length=frame_length, hop_length=hop_length).T

        lp_list = []
        
        for frame in frames:
            # Apply windowing
            frame = frame * np.hamming(len(frame))
            
            # LP residual energy extraction: using librosa.lpc for LPC coefficients
            try:
                a_lp = librosa.lpc(frame, order=lp_order) 
                residual_lp = lfilter(a_lp, [1.0], frame)
                lp_energy = np.sum(residual_lp ** 2)
            except Exception as ex:
                print(f"Error in LP energy computation for frame: {ex}")
                lp_energy = 0.0
            
            lp_list.append([lp_energy])  # Keep as list for 2D array

        lp_array = np.array(lp_list)  # shape: (num_frames, 1)
        
        # Average pool features to desired length
        lp_segments = np.array_split(lp_array, target_lp_segments, axis=0)
        lp_pooled = np.array([np.mean(seg, axis=0) for seg in lp_segments])
        
        return lp_pooled
    
    except Exception as e:
        print(f"Error processing {file_path} for LP: {e}")
        return np.zeros((target_lp_segments, 1))


In [30]:

def process_directory_lp(directory: str,
                         frame_size: float = 0.025,
                         frame_stride: float = 0.01,
                         target_lp_segments: int = 100,
                         lp_order: int = 12) -> Dict[str, np.ndarray]:
    """
    Processes all .wav files in the directory and extracts LP residual features.

    Parameters:
        directory (str): Path to the directory containing .wav files.
        frame_size (float): Frame duration in seconds.
        frame_stride (float): Step between successive frames in seconds.
        target_lp_segments (int): Desired number of pooled segments for LP residual features.
        lp_order (int): Order of the LP analysis.

    Returns:
        dict: Mapping of filename to flattened LP feature vectors.
    """
    feature_vectors = {}

    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            try:
                lp_features = extract_lp_features(
                    file_path,
                    frame_size=frame_size,
                    frame_stride=frame_stride,
                    target_lp_segments=target_lp_segments,
                    lp_order=lp_order
                )
                lp_flat = lp_features.flatten()  # Shape: (target_lp_segments,)
                feature_vectors[filename] = lp_flat
            except Exception as e:
                print(f"Error processing LP for {file_path}: {e}")

    return feature_vectors

In [31]:

def prepare_lp_dataset(features: dict, labels: pd.DataFrame):
    """
    Prepares LP feature set for classification.

    Parameters:
        features (dict): Dictionary mapping filenames to LP feature vectors.
        labels (pd.DataFrame): DataFrame containing emotion labels.
    
    Returns:
        tuple:
            np.ndarray: LP features.
            np.ndarray: Emotion labels.
    """
    X_lp, y = [], []

    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            # Direct access to feature vector (no nested dictionary)
            X_lp.append(features[file_id])
            y.append(int(row['EmotionNumeric']))

    return np.array(X_lp), np.array(y)

In [None]:
import wandb

# Initialize wandb
wandb.init(project="emotion-recognition", name="ssp-lp-classification")

# Experiment with different segment values for LP features
lp_segment_values = np.arange(20, 120, 15)  # Try different segment values from 20 to 140
labels = pd.read_csv(labels_csv_path)

for n in lp_segment_values:
    print(f"\nRunning for target_lp_segments = {n}")
    
    # Extract LP features with the current number of segments
    features = process_directory_lp(directory, target_lp_segments=n)
    print(f"Number of files processed: {len(features)}")
    
    # Prepare feature sets
    X_lp, y_lp = prepare_lp_dataset(features, labels)
    print("LP-only shape:", X_lp.shape)
    
    # Train and evaluate classifiers, logging metrics to wandb
    print(f"\n--- LP Residual with {n} segments ---")
    metrics = train_and_evaluate(X_lp, y_lp)
    
    # Log metrics to wandb with current segment value
    log_data = {
        "lp_segments": n,
        "feature_type": "LP_Residual",
        "feature_dim": X_lp.shape[1],
        **metrics
    }
    
    wandb.log(log_data)
    
    # Add summary for easy comparison
    wandb.run.summary[f"LP_n{n}_best_accuracy"] = max([
        metrics.get("SVM_Accuracy", 0),
        metrics.get("RandomForest_Accuracy", 0),
        metrics.get("XGBoost_Accuracy", 0),
        metrics.get("LogisticRegression_Accuracy", 0),
        metrics.get("KNN_Accuracy", 0)
    ])

# Finish wandb run
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
KNN_Accuracy,▂██▆▁█
KNN_F1,▃▆█▅▁▇
KNN_Precision,▆▄█▄▁▇
KNN_Recall,▂██▆▁█
LogisticRegression_Accuracy,▂▁▆▅▃█
LogisticRegression_F1,▁▂▆▆▃█
LogisticRegression_Precision,▁▂█▆▂▆
LogisticRegression_Recall,▂▁▆▅▃█
RandomForest_Accuracy,▁▃▃▄██
RandomForest_F1,▁▃▂▂██

0,1
KNN_Accuracy,0.30841
KNN_F1,0.26736
KNN_Precision,0.321
KNN_Recall,0.30841
LP_n100_best_accuracy,0.51402
LP_n120_best_accuracy,0.51402
LP_n20_best_accuracy,0.43925
LP_n40_best_accuracy,0.45794
LP_n60_best_accuracy,0.5514
LP_n80_best_accuracy,0.50467



Running for target_lp_segments = 20
Number of files processed: 535
LP-only shape: (535, 20)

--- LP Residual with 20 segments ---

Training and evaluating: SVM
              precision    recall  f1-score   support

           0       0.29      0.78      0.42        18
           1       0.57      0.20      0.30        20
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        18
           5       0.33      0.11      0.17         9
           6       0.25      0.75      0.38        16

    accuracy                           0.29       107
   macro avg       0.21      0.26      0.18       107
weighted avg       0.22      0.29      0.20       107

Accuracy: 0.2897, Precision: 0.2213, Recall: 0.2897, F1: 0.1968

Training and evaluating: RandomForest
              precision    recall  f1-score   support

           0       0.36      0.83      0.50        18
           1       0.59      0

KeyboardInterrupt: 

# GVV

In [34]:
import os
import numpy as np
import pandas as pd
import librosa
import scipy.signal
from typing import Dict

def extract_gvv_features(file_path: str, frame_size: float = 0.025, frame_stride: float = 0.01, n_segments: int = 10) -> np.ndarray:
    """
    Extracts GVV features from an audio file using a simplified IAIF approach.
    The features are averaged over time into an n x 1 feature matrix.

    Returns:
      np.ndarray: A (n_segments, 1) array of GVV energy values.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=None)
        frame_length = int(frame_size * sample_rate)
        hop_length = int(frame_stride * sample_rate)

        # Pre-emphasis
        pre_emphasis = 0.97
        emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

        # LPC Analysis
        lpc_order = 16
        lpc_coeffs = librosa.lpc(emphasized_signal, order=lpc_order)

        # Inverse filtering (glottal excitation)
        glottal_source = scipy.signal.lfilter(lpc_coeffs, [1.0], emphasized_signal)

        # Frame the glottal source signal
        frames = librosa.util.frame(glottal_source, frame_length=frame_length, hop_length=hop_length).T

        # Energy per frame
        frame_energies = np.sum(frames ** 2, axis=1)

        # Normalize energies
        frame_energies -= np.mean(frame_energies)
        frame_energies /= (np.std(frame_energies) + 1e-6)

        # Segment pooling
        segments = np.array_split(frame_energies, n_segments)
        pooled_features = np.array([np.mean(seg) for seg in segments]).reshape(-1, 1)

        return pooled_features

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return np.array([])

In [35]:
def process_directory_gvv(directory: str, n_segments: int = 10) -> Dict[str, np.ndarray]:
    """
    Extracts GVV features from all .wav files in the directory.
    """
    feature_vectors = {}
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            file_path = os.path.join(directory, filename)
            features = extract_gvv_features(file_path, n_segments=n_segments)
            if features.size > 0:
                feature_vectors[filename] = features.flatten()  # Flatten for classifier input
    return feature_vectors

In [None]:
import wandb

# Initialize wandb
wandb.init(project="emotion-recognition", name="ssp-gvv-classification")

# Experiment with different segment values for GVV features
gvv_segment_values = np.arange(5, 100, 10)  # Try different segment values from 5 to 45
labels = pd.read_csv(labels_csv_path)

for n in gvv_segment_values:
    print(f"\nRunning for n_segments = {n}")
    
    # Extract GVV features with the current number of segments
    features = process_directory_gvv(directory, n_segments=n)
    print(f"Number of files processed: {len(features)}")
    
    # Prepare feature sets
    X_gvv, y_gvv = [], []
    for _, row in labels.iterrows():
        file_id = row['Filename']
        if file_id in features:
            X_gvv.append(features[file_id])
            y_gvv.append(int(row['EmotionNumeric']))
    
    X_gvv = np.array(X_gvv)
    y_gvv = np.array(y_gvv)
    
    print("GVV-only shape:", X_gvv.shape)
    
    # Train and evaluate classifiers, logging metrics to wandb
    print(f"\n--- GVV with {n} segments ---")
    metrics = train_and_evaluate(X_gvv, y_gvv)
    
    # Log metrics to wandb with current segment value
    log_data = {
        "gvv_segments": n,
        "feature_type": "GVV",
        "feature_dim": X_gvv.shape[1],
        **metrics
    }
    
    wandb.log(log_data)
    
    # Add summary for easy comparison
    wandb.run.summary[f"GVV_n{n}_best_accuracy"] = max([
        metrics.get("SVM_Accuracy", 0),
        metrics.get("RandomForest_Accuracy", 0),
        metrics.get("XGBoost_Accuracy", 0),
        metrics.get("LogisticRegression_Accuracy", 0),
        metrics.get("KNN_Accuracy", 0)
    ])

# Finish wandb run
wandb.finish()

0,1
KNN_Accuracy,▁█
KNN_F1,▁█
KNN_Precision,█▁
KNN_Recall,▁█
LogisticRegression_Accuracy,█▁
LogisticRegression_F1,▁█
LogisticRegression_Precision,▁█
LogisticRegression_Recall,█▁
RandomForest_Accuracy,▁█
RandomForest_F1,▁█

0,1
KNN_Accuracy,0.30841
KNN_F1,0.2575
KNN_Precision,0.24679
KNN_Recall,0.30841
LP_n20_best_accuracy,0.43925
LP_n40_best_accuracy,0.45794
LogisticRegression_Accuracy,0.28972
LogisticRegression_F1,0.24973
LogisticRegression_Precision,0.31015
LogisticRegression_Recall,0.28972



Running for n_segments = 5
Number of files processed: 535
GVV-only shape: (535, 5)

--- GVV with 5 segments ---

Training and evaluating: SVM
              precision    recall  f1-score   support

           0       0.22      0.83      0.35        18
           1       0.44      0.60      0.51        20
           2       0.00      0.00      0.00        12
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00        18
           5       0.30      0.33      0.32         9
           6       0.50      0.06      0.11        16

    accuracy                           0.29       107
   macro avg       0.21      0.26      0.18       107
weighted avg       0.22      0.29      0.20       107

Accuracy: 0.2897, Precision: 0.2202, Recall: 0.2897, F1: 0.1973

Training and evaluating: RandomForest
              precision    recall  f1-score   support

           0       0.32      0.67      0.43        18
           1       0.39      0.35      0.37     

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


              precision    recall  f1-score   support

           0       0.27      0.56      0.36        18
           1       0.29      0.20      0.24        20
           2       0.00      0.00      0.00        12
           3       0.09      0.07      0.08        14
           4       0.43      0.33      0.38        18
           5       0.47      0.78      0.58         9
           6       0.23      0.19      0.21        16

    accuracy                           0.29       107
   macro avg       0.25      0.30      0.26       107
weighted avg       0.26      0.29      0.26       107

Accuracy: 0.2897, Precision: 0.2566, Recall: 0.2897, F1: 0.2587

Training and evaluating: LogisticRegression
              precision    recall  f1-score   support

           0       0.39      0.72      0.51        18
           1       0.44      0.35      0.39        20
           2       0.00      0.00      0.00        12
           3       0.18      0.14      0.16        14
           4       0.33

0,1
KNN_Accuracy,▇▅█▅▃▄▄▁▅
KNN_F1,█▅▇▆▃▂▄▁▄
KNN_Precision,█▄▅▅▄▁▄▃▄
KNN_Recall,▇▅█▅▃▄▄▁▅
LogisticRegression_Accuracy,▁▅█▅▄▆▆▃▅
LogisticRegression_F1,▁▅█▆▅▆▇▅▆
LogisticRegression_Precision,▁▅█▆▅▇█▅▅
LogisticRegression_Recall,▁▅█▅▄▆▆▃▅
RandomForest_Accuracy,▅█▇▂▅▃▁▂▆
RandomForest_F1,▇█▆▁▃▃▂▁▅

0,1
GVV_n10_best_accuracy,0.39252
GVV_n15_best_accuracy,0.40187
GVV_n20_best_accuracy,0.35514
GVV_n25_best_accuracy,0.33645
GVV_n30_best_accuracy,0.36449
GVV_n35_best_accuracy,0.36449
GVV_n40_best_accuracy,0.34579
GVV_n45_best_accuracy,0.35514
GVV_n5_best_accuracy,0.33645
KNN_Accuracy,0.28972


# Combination of features

In [None]:
# import wandb
# import os
# import numpy as np
# import pandas as pd

# # Initialize wandb
# wandb.init(project="emotion-recognition", name="ssp-combined-features")

# # Define best parameter settings for each feature type based on previous experiments
# mfcc_segments = 15  # Adjust based on your best MFCC results
# rcc_segments = 15   # Adjust based on your best RCC results
# lp_segments = 60    # Adjust based on your best LP results
# gvv_segments = 25   # Adjust based on your best GVV results

# labels = pd.read_csv(labels_csv_path)

# # Extract features with the optimal segment values
# print("Extracting MFCC features...")
# mfcc_features = process_directory_mfcc(directory, n_segments=mfcc_segments)

# print("Extracting RCC features...")
# rcc_features = process_directory_rcc(directory, target_rcc_segments=rcc_segments)

# print("Extracting LP features...")
# lp_features = process_directory_lp(directory, target_lp_segments=lp_segments)

# print("Extracting GVV features...")
# gvv_features = process_directory_gvv(directory, n_segments=gvv_segments)

# # Combine features by concatenation
# combined_features = {}
# feature_counts = {
#     'total': 0,
#     'mfcc_only': 0,
#     'rcc_only': 0,
#     'lp_only': 0,
#     'gvv_only': 0,
#     'combined': 0
# }

# for filename in os.listdir(directory):
#     if filename.endswith('.wav'):
#         feature_counts['total'] += 1
        
#         # For each file, collect available features
#         vectors = []
        
#         if filename in mfcc_features:
#             mfcc_vec = mfcc_features[filename].flatten()
#             vectors.append(mfcc_vec)
#             if len(vectors) == 1:
#                 feature_counts['mfcc_only'] += 1
        
#         if filename in rcc_features:
#             rcc_vec = rcc_features[filename]  # Already flattened
#             vectors.append(rcc_vec)
#             if len(vectors) == 1:
#                 feature_counts['rcc_only'] += 1
        
#         if filename in lp_features:
#             lp_vec = lp_features[filename]  # Already flattened
#             vectors.append(lp_vec)
#             if len(vectors) == 1:
#                 feature_counts['lp_only'] += 1
        
#         if filename in gvv_features:
#             gvv_vec = gvv_features[filename]  # Already flattened
#             vectors.append(gvv_vec)
#             if len(vectors) == 1:
#                 feature_counts['gvv_only'] += 1
        
#         # Only include files that have all feature types
#         if len(vectors) == 4:  # All feature types available
#             combined = np.concatenate(vectors)
#             combined_features[filename] = combined
#             feature_counts['combined'] += 1

# print(f"Feature counts: {feature_counts}")
# print(f"Number of files with all features: {len(combined_features)}")

# # Prepare dataset using the combined feature vectors
# X_combined, y_combined = [], []
# for _, row in labels.iterrows():
#     file_id = row['Filename']
#     if file_id in combined_features:
#         X_combined.append(combined_features[file_id])
#         y_combined.append(int(row['EmotionNumeric']))

# X_combined = np.array(X_combined)
# y_combined = np.array(y_combined)

# print("Combined dataset shape:", X_combined.shape)

# # Train and evaluate classifiers on the combined feature set
# print("\n--- Combined Features Classification ---")
# metrics = train_and_evaluate(X_combined, y_combined)

# # Log metrics to wandb
# feature_dimensions = {
#     'mfcc_dim': mfcc_segments * 39,
#     'rcc_dim': rcc_segments * 12,
#     'lp_dim': lp_segments,
#     'gvv_dim': gvv_segments,
#     'total_dim': X_combined.shape[1]
# }

# log_data = {
#     'feature_type': 'Combined',
#     **feature_dimensions,
#     'dataset_size': len(X_combined),
#     **metrics
# }

# wandb.log(log_data)

# # Add summary for easy comparison
# wandb.run.summary["Combined_best_accuracy"] = max([
#     metrics.get("SVM_Accuracy", 0),
#     metrics.get("RandomForest_Accuracy", 0),
#     metrics.get("XGBoost_Accuracy", 0),
#     metrics.get("LogisticRegression_Accuracy", 0),
#     metrics.get("KNN_Accuracy", 0)
# ])

# # Finish wandb run
# wandb.finish()

Extracting MFCC features...
Extracting RCC features...
Extracting LP features...
Extracting GVV features...
Feature counts: {'total': 535, 'mfcc_only': 535, 'rcc_only': 0, 'lp_only': 0, 'gvv_only': 0, 'combined': 535}
Number of files with all features: 535
Combined dataset shape: (535, 850)

--- Combined Features Classification ---

Training and evaluating: SVM
              precision    recall  f1-score   support

           0       0.57      0.72      0.63        18
           1       0.50      0.40      0.44        20
           2       0.11      0.08      0.10        12
           3       0.60      0.64      0.62        14
           4       0.57      0.44      0.50        18
           5       0.50      0.67      0.57         9
           6       0.50      0.56      0.53        16

    accuracy                           0.50       107
   macro avg       0.48      0.50      0.49       107
weighted avg       0.49      0.50      0.49       107

Accuracy: 0.5047, Precision: 0.4925, Re

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


              precision    recall  f1-score   support

           0       0.69      0.61      0.65        18
           1       0.67      0.40      0.50        20
           2       0.25      0.25      0.25        12
           3       0.42      0.57      0.48        14
           4       0.42      0.28      0.33        18
           5       0.38      0.56      0.45         9
           6       0.43      0.62      0.51        16

    accuracy                           0.47       107
   macro avg       0.47      0.47      0.45       107
weighted avg       0.49      0.47      0.46       107

Accuracy: 0.4673, Precision: 0.4909, Recall: 0.4673, F1: 0.4648

Training and evaluating: KNN
              precision    recall  f1-score   support

           0       0.43      0.83      0.57        18
           1       0.42      0.40      0.41        20
           2       0.33      0.08      0.13        12
           3       0.44      0.29      0.35        14
           4       0.67      0.22     

0,1
KNN_Accuracy,▁
KNN_F1,▁
KNN_Precision,▁
KNN_Recall,▁
LogisticRegression_Accuracy,▁
LogisticRegression_F1,▁
LogisticRegression_Precision,▁
LogisticRegression_Recall,▁
RandomForest_Accuracy,▁
RandomForest_F1,▁

0,1
Combined_best_accuracy,0.54206
KNN_Accuracy,0.43925
KNN_F1,0.40641
KNN_Precision,0.47128
KNN_Recall,0.43925
LogisticRegression_Accuracy,0.46729
LogisticRegression_F1,0.46478
LogisticRegression_Precision,0.49085
LogisticRegression_Recall,0.46729
RandomForest_Accuracy,0.52336
