# Emotion Recognition from Speech - Full Pipeline

This notebook implements the complete emotion recognition pipeline:
- Data loading and exploration
- Feature extraction (MFCCs & Mel-spectrograms)
- Data preprocessing (label encoding and feature standardization; augmentation is not implemented yet)
- Train/validation/test splitting
- Baseline CNN and CNN-LSTM hybrid models
- Model evaluation and comparison

---

## 1. Import Libraries and Set Configuration

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# TensorFlow/Keras imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Seed NumPy and TensorFlow from one value for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Plot appearance: seaborn grid style and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 2. Configuration and Setup

In [None]:
# Central settings for the notebook: dataset location, audio/feature
# parameters, label vocabulary and training hyperparameters.
CONFIG = {
    # Folder scanned for 'Actor_*' sub-directories containing .wav files
    'data_dir': 'data/Audio_Song_Actors_01-24_Actors_1_to_17/Audio_Song_Actors_01-24',
    # Number of Mel bands requested for the Mel-spectrogram features
    'n_mels': 128,
    # Number of MFCC coefficients requested per file
    'n_mfcc': 13,
    # Sampling rate passed to librosa.load
    'sample_rate': 22050,
    # FFT window size and hop length passed to both feature extractors
    'n_fft': 2048,
    'hop_length': 512,
    # Maps the third dash-separated field of a filename to an emotion name
    'emotion_map': {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fearful',
        '07': 'disgust',
        '08': 'surprised'
    },
    # Only files whose decoded emotion is in this list are loaded
    'target_emotions': ['happy', 'sad', 'angry', 'neutral'],
    # Intended hold-out fractions for the test and validation splits
    'test_size': 0.2,
    'val_size': 0.2,
    'batch_size': 16,          # Reduced for better convergence
    'epochs': 100,             # Increased for better training
    'learning_rate': 0.0005,   # Lower learning rate
}

# Echo every setting so the parameters of this run appear in the cell output
print('Configuration loaded:')
for key, value in CONFIG.items():
    print(f'  {key}: {value}')

## 3. Data Loading and Exploration

In [None]:
def parse_ravdess_filename(filename, emotion_map):
    """Decode the emotion label and actor number from a dash-separated filename.

    The third field (index 2) is looked up in ``emotion_map``; codes missing
    from the map yield ``'unknown'``. The seventh field (index 6), with any
    ``'.wav'`` text removed, is converted to an integer actor number.

    Args:
        filename: Bare file name such as ``'03-02-05-01-01-01-12.wav'``.
        emotion_map: Mapping from emotion code string to emotion name.

    Returns:
        Tuple ``(emotion, actor)``.

    Raises:
        IndexError: If the name has fewer than seven dash-separated fields.
        ValueError: If the actor field is not an integer.
    """
    fields = filename.split('-')
    label = emotion_map.get(fields[2], 'unknown')
    actor_token = fields[6].replace('.wav', '')
    return label, int(actor_token)

def load_dataset(data_dir, config):
    """Collect audio file paths and emotion labels for the target emotions.

    Walks every ``Actor_*`` sub-directory of ``data_dir``, decodes the emotion
    of each ``.wav`` file from its name via ``parse_ravdess_filename`` and
    keeps the file when that emotion is in ``config['target_emotions']``.

    Directory listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; a stable order is needed for the fixed-seed split
    downstream to select the same samples on every machine.

    Args:
        data_dir: Folder containing the ``Actor_*`` sub-directories.
        config: Mapping providing ``'target_emotions'`` (labels to keep) and
            ``'emotion_map'`` (filename emotion code -> emotion name).

    Returns:
        Tuple ``(files, labels)`` of parallel lists: the path of each kept
        file and its emotion label.
    """
    target_emotions = config['target_emotions']
    emotion_map = config['emotion_map']

    # Only real directories count; a stray file named 'Actor_*' is ignored
    actor_dirs = sorted(
        d for d in os.listdir(data_dir)
        if d.startswith('Actor_') and os.path.isdir(os.path.join(data_dir, d))
    )
    print(f'Found {len(actor_dirs)} actor directories')

    files = []
    labels = []
    for actor_dir in actor_dirs:
        actor_path = os.path.join(data_dir, actor_dir)
        wav_names = sorted(f for f in os.listdir(actor_path) if f.endswith('.wav'))

        for audio_file in wav_names:
            try:
                emotion, _ = parse_ravdess_filename(audio_file, emotion_map)
            except (IndexError, ValueError) as e:
                # A .wav that does not follow the naming scheme should not abort the scan
                print(f'Skipping {audio_file}: unexpected filename format ({e})')
                continue
            if emotion in target_emotions:
                files.append(os.path.join(actor_path, audio_file))
                labels.append(emotion)

    print(f'Loaded {len(files)} audio files')
    print('Emotion distribution:')
    for emotion in target_emotions:
        print(f'  {emotion}: {labels.count(emotion)}')

    return files, labels

# Scan the configured data directory; yields two parallel lists: the .wav
# paths and their emotion names (only CONFIG['target_emotions'] are kept).
audio_files, emotion_labels = load_dataset(CONFIG['data_dir'], CONFIG)

## 4. Feature Extraction

In [None]:
def extract_mfcc(audio_path, config):
    """Compute a fixed-length MFCC summary vector for one audio file.

    The file is loaded at ``config['sample_rate']``, MFCCs are computed with
    ``config['n_mfcc']``, ``config['n_fft']`` and ``config['hop_length']``,
    and the coefficient matrix is averaged along axis 1.

    Args:
        audio_path: Path of the audio file to read.
        config: Mapping with the keys named above.

    Returns:
        1-D array with one mean value per coefficient, or ``None`` when any
        step raises (the error is printed, not propagated).
    """
    try:
        signal, rate = librosa.load(audio_path, sr=config['sample_rate'])
        coefficients = librosa.feature.mfcc(
            y=signal,
            sr=rate,
            n_mfcc=config['n_mfcc'],
            n_fft=config['n_fft'],
            hop_length=config['hop_length']
        )
        averaged = np.mean(coefficients, axis=1)
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do
        print(f'Error processing {audio_path}: {e}')
        return None
    return averaged

def extract_melspectrogram(audio_path, config):
    """Compute a fixed-length Mel-spectrogram summary vector for one audio file.

    The file is loaded at ``config['sample_rate']``, a Mel power spectrogram
    is computed with ``config['n_mels']``, ``config['n_fft']`` and
    ``config['hop_length']``, converted to dB relative to its maximum, and
    averaged along axis 1.

    Args:
        audio_path: Path of the audio file to read.
        config: Mapping with the keys named above.

    Returns:
        1-D array with one mean dB value per Mel band, or ``None`` when any
        step raises (the error is printed, not propagated).
    """
    try:
        signal, rate = librosa.load(audio_path, sr=config['sample_rate'])
        power_spec = librosa.feature.melspectrogram(
            y=signal,
            sr=rate,
            n_mels=config['n_mels'],
            n_fft=config['n_fft'],
            hop_length=config['hop_length']
        )
        # dB scale referenced to the loudest bin of this file
        spec_db = librosa.power_to_db(power_spec, ref=np.max)
        averaged = np.mean(spec_db, axis=1)
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do
        print(f'Error processing {audio_path}: {e}')
        return None
    return averaged

# NOTE(review): a file whose extraction fails is represented by an all-zero
# vector while its label is kept, so failures silently enter the dataset.
total_files = len(audio_files)

print('Extracting MFCC features...')
mfcc_features = []
for position, path in enumerate(audio_files, start=1):
    # Progress line every 50 files
    if position % 50 == 0:
        print(f'  Processed {position}/{total_files} files')
    vector = extract_mfcc(path, CONFIG)
    mfcc_features.append(np.zeros(CONFIG['n_mfcc']) if vector is None else vector)

print('Extracting Mel-spectrogram features...')
mel_features = []
for position, path in enumerate(audio_files, start=1):
    if position % 50 == 0:
        print(f'  Processed {position}/{total_files} files')
    vector = extract_melspectrogram(path, CONFIG)
    mel_features.append(np.zeros(CONFIG['n_mels']) if vector is None else vector)

# Stack the per-file vectors into (n_files, n_features) matrices
X_mfcc = np.array(mfcc_features)
X_mel = np.array(mel_features)
y = np.array(emotion_labels)

for caption, array in (('MFCC features', X_mfcc), ('Mel features', X_mel), ('Labels', y)):
    print(f'{caption} shape: {array.shape}')

## 5. Data Preprocessing and Splitting

In [None]:
# Encode emotion names as integer ids (alphabetical order) and one-hot vectors
emotion_list = sorted(set(emotion_labels))
emotion_to_idx = {emotion: idx for idx, emotion in enumerate(emotion_list)}
y_encoded = np.array([emotion_to_idx[e] for e in emotion_labels])
y_categorical = to_categorical(y_encoded, num_classes=len(emotion_list))

print(f'Emotion mapping: {emotion_to_idx}')
print(f'Number of classes: {len(emotion_list)}')

# Split data: train (60%), validation (20%), test (20%) with the default CONFIG.
# The stratified split is drawn ONCE over sample indices and then applied to
# every array, so MFCC features, Mel features and labels always describe the
# same samples. Positional slices of X_mel must not be used here: the split
# shuffles, so row position says nothing about split membership.
holdout_fraction = CONFIG['test_size'] + CONFIG['val_size']
sample_idx = np.arange(len(y_encoded))

train_idx, holdout_idx = train_test_split(
    sample_idx, test_size=holdout_fraction, random_state=42, stratify=y_encoded
)
val_idx, test_idx = train_test_split(
    holdout_idx,
    test_size=CONFIG['test_size'] / holdout_fraction,
    random_state=42,
    stratify=y_encoded[holdout_idx],
)

y_train, y_val, y_test = (y_categorical[idx] for idx in (train_idx, val_idx, test_idx))
X_mfcc_train, X_mfcc_val, X_mfcc_test = (X_mfcc[idx] for idx in (train_idx, val_idx, test_idx))
X_mel_train, X_mel_val, X_mel_test = (X_mel[idx] for idx in (train_idx, val_idx, test_idx))

# Guard the invariants the rest of the notebook relies on
assert len(set(train_idx) | set(val_idx) | set(test_idx)) == len(sample_idx), 'splits overlap or drop samples'
assert len(X_mel_train) == len(X_mfcc_train) == len(y_train)
assert len(X_mel_val) == len(X_mfcc_val) == len(y_val)
assert len(X_mel_test) == len(X_mfcc_test) == len(y_test)

# Normalize features: statistics come from the training rows only and are
# then applied unchanged to validation and test rows
scaler_mfcc = StandardScaler()
X_mfcc_train = scaler_mfcc.fit_transform(X_mfcc_train)
X_mfcc_val = scaler_mfcc.transform(X_mfcc_val)
X_mfcc_test = scaler_mfcc.transform(X_mfcc_test)

scaler_mel = StandardScaler()
X_mel_train = scaler_mel.fit_transform(X_mel_train)
X_mel_val = scaler_mel.transform(X_mel_val)
X_mel_test = scaler_mel.transform(X_mel_test)

print(f'Training set: {X_mfcc_train.shape}')
print(f'Validation set: {X_mfcc_val.shape}')
print(f'Test set: {X_mfcc_test.shape}')

## 6. Build Baseline CNN Model

In [None]:
def build_baseline_cnn(input_shape, num_classes):
    """Assemble the uncompiled baseline 1-D CNN classifier.

    The flat feature vector is reshaped to a one-channel sequence and passed
    through three Conv1D / BatchNorm / MaxPool / Dropout stages (32, 64 and
    128 filters), global average pooling, a 256-unit dense layer and a
    softmax output.

    Args:
        input_shape: Number of features per sample (an int, not a tuple).
        num_classes: Size of the softmax output layer.

    Returns:
        A Keras ``Sequential`` model.
    """
    stack = [layers.Reshape((input_shape, 1), input_shape=(input_shape,))]

    # Three identical convolutional stages that differ only in filter count
    for n_filters in (32, 64, 128):
        stack += [
            layers.Conv1D(n_filters, kernel_size=3, activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling1D(pool_size=2),
            layers.Dropout(0.3),
        ]

    # Classification head
    stack += [
        layers.GlobalAveragePooling1D(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]
    return models.Sequential(stack)

# Instantiate the baseline for MFCC-width inputs, then attach optimizer and loss
n_mfcc_features = X_mfcc_train.shape[1]
baseline_cnn = build_baseline_cnn(n_mfcc_features, len(emotion_list))

baseline_optimizer = keras.optimizers.Adam(learning_rate=0.001)
baseline_cnn.compile(
    optimizer=baseline_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
baseline_cnn.summary()

## 7. Build CNN-LSTM Hybrid Model

In [None]:
def build_cnn_lstm_hybrid(input_shape, num_classes):
    """Assemble the uncompiled Conv1D + stacked-LSTM classifier.

    The flat feature vector is reshaped to a one-channel sequence, passed
    through two Conv1D / BatchNorm / MaxPool / Dropout stages (32 and 64
    filters), then two LSTM layers (128 units returning the full sequence,
    64 units returning the final state) and a 256 -> 128 -> ``num_classes``
    dense head ending in softmax.

    Args:
        input_shape: Number of features per sample (an int, not a tuple).
        num_classes: Size of the softmax output layer.

    Returns:
        A Keras ``Sequential`` model.
    """
    # Reshape input for Conv1D
    stack = [layers.Reshape((input_shape, 1), input_shape=(input_shape,))]

    # Convolutional front end
    for n_filters in (32, 64):
        stack += [
            layers.Conv1D(n_filters, kernel_size=3, activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling1D(pool_size=2),
            layers.Dropout(0.2),
        ]

    # Recurrent layers run over the sequence left after the two pooling steps.
    # NOTE(review): both LSTMs override the default activation with ReLU;
    # confirm this is intentional (per the Keras docs the cuDNN fast path
    # requires tanh).
    stack += [
        layers.LSTM(128, return_sequences=True, activation='relu'),
        layers.Dropout(0.2),
        layers.LSTM(64, activation='relu'),
        layers.Dropout(0.2),
    ]

    # Dense classification head
    stack += [
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]
    return models.Sequential(stack)

# Build and compile CNN-LSTM hybrid.
# This network is trained and evaluated on the Mel features, so its input
# width must come from X_mel_train; sizing it from the MFCC matrix makes
# fit() reject the Mel batches with an input-shape error.
cnn_lstm = build_cnn_lstm_hybrid(X_mel_train.shape[1], len(emotion_list))
cnn_lstm.compile(
    # Learning rate is taken from the shared configuration (0.0005 by default)
    optimizer=keras.optimizers.Adam(learning_rate=CONFIG['learning_rate']),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
cnn_lstm.summary()

## 8. Train Baseline CNN on MFCC Features

In [None]:
print('Training Baseline CNN on MFCC features...')

# Training callbacks; the CNN-LSTM training cell reuses this same list
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
callbacks = [early_stopping, lr_on_plateau]

baseline_fit_options = dict(
    validation_data=(X_mfcc_val, y_val),
    epochs=CONFIG['epochs'],
    batch_size=CONFIG['batch_size'],
    callbacks=callbacks,
    verbose=1,
)
history_baseline_mfcc = baseline_cnn.fit(X_mfcc_train, y_train, **baseline_fit_options)

print('Training complete!')

## 9. Train CNN-LSTM Hybrid on Mel-Spectrogram Features

In [None]:
print('Training CNN-LSTM Hybrid on Mel-Spectrogram features...')

# Same epochs, batch size and callbacks list as the baseline run
hybrid_fit_options = dict(
    validation_data=(X_mel_val, y_val),
    epochs=CONFIG['epochs'],
    batch_size=CONFIG['batch_size'],
    callbacks=callbacks,
    verbose=1,
)
history_cnn_lstm = cnn_lstm.fit(X_mel_train, y_train, **hybrid_fit_options)

print('Training complete!')

## 10. Model Evaluation and Comparison

In [None]:
def _evaluate_on_test(model, features, true_labels, class_names, banner):
    """Predict on held-out features, print the metric report, return the results.

    Returns a tuple ``(probabilities, predicted_labels, accuracy, weighted_f1)``.
    """
    probabilities = model.predict(features)
    predicted = np.argmax(probabilities, axis=1)

    accuracy = accuracy_score(true_labels, predicted)
    weighted_f1 = f1_score(true_labels, predicted, average='weighted')

    print(banner)
    print(f'Accuracy: {accuracy:.4f}')
    print(f'F1-Score: {weighted_f1:.4f}')
    print('\nClassification Report:')
    print(classification_report(true_labels, predicted, target_names=class_names))
    return probabilities, predicted, accuracy, weighted_f1


# Integer class ids of the test samples (shared by both evaluations)
y_test_labels = np.argmax(y_test, axis=1)

# Evaluate Baseline CNN on test set
y_pred_baseline, y_pred_baseline_labels, baseline_accuracy, baseline_f1 = _evaluate_on_test(
    baseline_cnn, X_mfcc_test, y_test_labels, emotion_list,
    '=== BASELINE CNN (MFCC) ==='
)

# Evaluate CNN-LSTM on test set
y_pred_lstm, y_pred_lstm_labels, lstm_accuracy, lstm_f1 = _evaluate_on_test(
    cnn_lstm, X_mel_test, y_test_labels, emotion_list,
    '\n=== CNN-LSTM HYBRID (Mel-Spectrogram) ==='
)

## 11. Visualize Results

In [None]:
# One figure per model, each with a loss panel (left) and an accuracy panel (right)
history_panels = [
    (history_baseline_mfcc, 'Baseline CNN'),
    (history_cnn_lstm, 'CNN-LSTM Hybrid'),
]
metric_panels = [('loss', 'Loss'), ('accuracy', 'Accuracy')]

for run_history, model_title in history_panels:
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))

    for panel, (metric_key, metric_name) in zip(axes, metric_panels):
        curves = run_history.history
        panel.plot(curves[metric_key], label=f'Train {metric_name}')
        panel.plot(curves[f'val_{metric_key}'], label=f'Val {metric_name}')
        panel.set_title(f'{model_title} - {metric_name}')
        panel.set_xlabel('Epoch')
        panel.set_ylabel(metric_name)
        panel.legend()
        panel.grid(True)

    plt.tight_layout()
    plt.show()

## 12. Confusion Matrices

In [None]:
# Confusion matrices of both models on the shared test labels
cm_baseline = confusion_matrix(y_test_labels, y_pred_baseline_labels)
cm_lstm = confusion_matrix(y_test_labels, y_pred_lstm_labels)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (matrix, colour map, panel title) for the left and right panels
heatmap_specs = [
    (cm_baseline, 'Blues', 'Baseline CNN Confusion Matrix'),
    (cm_lstm, 'Greens', 'CNN-LSTM Hybrid Confusion Matrix'),
]
for panel, (matrix, palette, panel_title) in zip(axes, heatmap_specs):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=panel,
                xticklabels=emotion_list, yticklabels=emotion_list)
    panel.set_title(panel_title)
    panel.set_ylabel('True Label')
    panel.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## 13. Model Comparison

In [None]:
# Summary table: one row per model
comparison_df = pd.DataFrame([
    {
        'Model': 'Baseline CNN (MFCC)',
        'Accuracy': baseline_accuracy,
        'F1-Score': baseline_f1,
        'Feature Type': 'MFCC',
        'Architecture': '3x Conv1D',
    },
    {
        'Model': 'CNN-LSTM Hybrid (Mel-Spec)',
        'Accuracy': lstm_accuracy,
        'F1-Score': lstm_f1,
        'Feature Type': 'Mel-Spectrogram',
        'Architecture': 'Conv1D + LSTM',
    },
])

print('\n=== MODEL COMPARISON ===')
print(comparison_df.to_string(index=False))

# Grouped bar chart: accuracy bar left of each tick, F1 bar right of it
fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(comparison_df))
width = 0.35
half_width = width / 2

ax.bar(x - half_width, comparison_df['Accuracy'], width, label='Accuracy', alpha=0.8)
ax.bar(x + half_width, comparison_df['F1-Score'], width, label='F1-Score', alpha=0.8)

ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'])
ax.legend()
ax.set_ylim([0, 1])
ax.grid(True, alpha=0.3)

# Print each score just above its bar
score_pairs = comparison_df[['Accuracy', 'F1-Score']].itertuples(index=False, name=None)
for i, (acc, f1) in enumerate(score_pairs):
    ax.text(i - half_width, acc + 0.02, f'{acc:.3f}', ha='center')
    ax.text(i + half_width, f1 + 0.02, f'{f1:.3f}', ha='center')

plt.tight_layout()
plt.show()

## 14. Save Models

In [None]:
# Make sure the output folder exists (no error when it already does)
os.makedirs('models', exist_ok=True)

# Each trained model paired with the file it is written to
save_targets = [
    (baseline_cnn, 'models/baseline_cnn_mfcc.h5'),
    (cnn_lstm, 'models/cnn_lstm_mel.h5'),
]
for trained_model, target_path in save_targets:
    trained_model.save(target_path)

print('Models saved successfully!')
for _, target_path in save_targets:
    print(f'  - {target_path}')