# Emotion Recognition Pipeline - Improved (Fixed)

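Fixed version of the pipeline, with the following changes aimed at better accuracy:
- MFCC features (13 coefficients per frame)
- Proper data augmentation (Gaussian noise and a 2-frame time shift)
- Class weights (balanced, computed from the training labels)
- Better architecture (Conv1D followed by two stacked LSTM layers)
- Lower learning rate (1e-4 with Adam)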

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Seed NumPy's global RNG and TensorFlow's global seed with the same value.
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(42)

# Plot defaults: seaborn 'whitegrid' theme and a 12x6 inch default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('Libraries loaded!')

In [None]:
# Load audio files
# Walk every 'Actor_*' folder under data_dir and keep the .wav files whose
# emotion (looked up from the file name) is one of target_emotions.
data_dir = 'data/Audio_Song_Actors_01-24_Actors_1_to_17'
target_emotions = ['happy', 'sad', 'angry', 'neutral']
# Maps the third dash-separated field of a file name to an emotion label.
# 'calm' is mapped but is not in target_emotions, so those files are dropped.
emotion_map = {'01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad', '05': 'angry'}

files = []
labels = []

for actor_dir in sorted(os.listdir(data_dir)):
    actor_path = os.path.join(data_dir, actor_dir)
    # Skip anything that is not an actor folder (stray files, hidden entries).
    if not actor_dir.startswith('Actor_') or not os.path.isdir(actor_path):
        continue
    # os.listdir returns entries in arbitrary order; sort so the file order,
    # and therefore the seeded train/val/test split, is the same on every run.
    for fname in sorted(os.listdir(actor_path)):
        if not fname.endswith('.wav'):
            continue
        parts = fname.split('-')
        if len(parts) < 3:
            # No emotion field in the name: skip instead of raising IndexError.
            continue
        emotion = emotion_map.get(parts[2], 'unknown')
        if emotion in target_emotions:
            files.append(os.path.join(actor_path, fname))
            labels.append(emotion)

print(f'Loaded {len(files)} files')
print(f'Emotions: {pd.Series(labels).value_counts().to_dict()}')

In [None]:
# Extract MFCC features
# For every file: load the audio at `sr` Hz, compute `n_mfcc` MFCCs per frame,
# and keep the label of each file that was processed successfully.
sr = 22050
n_mfcc = 13

print('Extracting MFCC...')
mfcc_list = []
valid_labels = []
failed_files = []

for i, f in enumerate(files):
    if (i + 1) % 50 == 0:
        print(f'  {i + 1}/{len(files)}')
    try:
        # `signal` (not `y`) so the waveform does not share a name with the
        # label array assigned at the end of this cell.
        signal, _ = librosa.load(f, sr=sr)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    except Exception as exc:
        # Still best-effort (a bad file is skipped), but the failure is now
        # reported, and KeyboardInterrupt is no longer swallowed.
        failed_files.append(f)
        print(f'  Skipping {f}: {exc}')
        continue
    mfcc_list.append(mfcc.T)  # (time_steps, n_mfcc)
    valid_labels.append(labels[i])

print(f'Extracted {len(mfcc_list)} features')
if failed_files:
    print(f'Failed to process {len(failed_files)} files')

if not mfcc_list:
    # max() below would otherwise fail with an unhelpful "empty sequence" error.
    raise ValueError('No MFCC features were extracted; check data_dir and the audio files.')

# Pad to same length
# Shorter sequences are zero-padded at the end up to the longest one.
max_len = max(len(m) for m in mfcc_list)
X = np.zeros((len(mfcc_list), max_len, n_mfcc))
for i, m in enumerate(mfcc_list):
    X[i, :len(m), :] = m

y = np.array(valid_labels)
print(f'Data shape: {X.shape}')

In [None]:
# Prepare data
# Encode the string labels as integers (alphabetical order) and one-hot vectors.
emotion_list = sorted(set(y))
emotion_to_idx = dict(zip(emotion_list, range(len(emotion_list))))
y_encoded = np.array([emotion_to_idx[label] for label in y])
y_cat = to_categorical(y_encoded, len(emotion_list))

print(f'Classes: {emotion_list}')

# Split
# Hold out 20% for testing, then take a quarter of the remainder (20% of the
# total) for validation. Both splits are stratified by class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y_encoded
)
val_strata = np.argmax(y_train, axis=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=val_strata
)

# Normalize
# Standardise each MFCC coefficient using statistics from the training frames only.
# NOTE(review): every frame is flattened into the fit, so any zero-padded
# frames in X also contribute to the mean/variance -- confirm this is intended.
scaler = StandardScaler()
scaler.fit(X_train.reshape(-1, n_mfcc))


def _standardize(batch):
    """Apply the fitted scaler frame by frame and restore the original shape."""
    return scaler.transform(batch.reshape(-1, n_mfcc)).reshape(batch.shape)


X_train, X_val, X_test = (_standardize(part) for part in (X_train, X_val, X_test))

print(f'Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}')

In [None]:
# Build model
# Conv1D front-end over the MFCC frames, two stacked LSTMs to summarise the
# sequence, then a dense classifier head with one softmax output per emotion.
model = models.Sequential([
    layers.Conv1D(64, 5, activation='relu', padding='same', input_shape=(X_train.shape[1], n_mfcc)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    # First LSTM returns the full sequence so the second LSTM can consume it.
    layers.LSTM(256, return_sequences=True),
    layers.Dropout(0.3),

    # Second LSTM returns only its final output, which feeds the dense head.
    layers.LSTM(128),
    layers.Dropout(0.3),

    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.4),

    layers.Dense(128, activation='relu'),
    layers.Dropout(0.4),

    layers.Dense(len(emotion_list), activation='softmax')
])

# categorical_crossentropy matches the one-hot encoded labels.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

In [None]:
# Data augmentation - FIXED
print('Augmenting data...')

# One noisy copy of every training sample (Gaussian noise, sigma = 0.02).
# Drawing the noise for the whole array at once consumes NumPy's global RNG
# in the same order as drawing it sample by sample.
noisy = X_train + np.random.normal(0, 0.02, X_train.shape)

# One copy of every sample rolled by two frames along the time axis
# (frames that fall off the end wrap around to the start).
shifted = np.roll(X_train, 2, axis=1)

# Interleave the copies as noise_0, shift_0, noise_1, shift_1, ... and repeat
# each label twice to match.
extra_X = np.stack((noisy, shifted), axis=1).reshape((-1,) + X_train.shape[1:])
extra_y = np.repeat(y_train, 2, axis=0)

# Final training set: the originals first, followed by the augmented copies.
X_aug = np.concatenate((X_train, extra_X), axis=0)
y_aug = np.concatenate((y_train, extra_y), axis=0)
print(f'Augmented: {X_aug.shape}')

# Class weights
# 'balanced' weights computed from the (un-augmented) training labels.
y_train_labels = y_train.argmax(axis=1)
cw = compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
cw_dict = {idx: weight for idx, weight in enumerate(cw)}
print(f'Class weights: {cw_dict}')

In [None]:
# Train
print('Training...')

# Stop once val_loss has not improved for 20 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
# Halve the learning rate after 10 epochs without val_loss improvement, down to 1e-7.
lr_decay = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-7, verbose=1)
callbacks = [early_stop, lr_decay]

# Fit on the augmented training set; validation uses the un-augmented split.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=16,
    class_weight=cw_dict,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(X_aug, y_aug, **fit_options)

print('Done!')

In [None]:
# Evaluate
# Turn the predicted probabilities and the one-hot targets into class indices.
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

acc = accuracy_score(y_test_labels, y_pred_labels)
f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')

# Headline metrics on the test split, followed by the per-class report.
summary_lines = [
    '\n=== RESULTS ===',
    f'Accuracy: {acc:.4f}',
    f'F1-Score: {f1:.4f}',
    '\nReport:',
]
print('\n'.join(summary_lines))
print(classification_report(y_test_labels, y_pred_labels, target_names=emotion_list))

In [None]:
# Plot
# Training vs. validation curves: loss on the left, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

for ax, metric, title in zip(axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
    ax.plot(history.history[metric], label='Train')
    ax.plot(history.history[f'val_{metric}'], label='Val')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.legend()
    # A bare grid() call toggles the grid; with a style that already enables
    # it (e.g. seaborn 'whitegrid') that would switch it off, so set it explicitly.
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix
# Rows are the true classes and columns the predicted ones, labelled by emotion.
cm = confusion_matrix(y_test_labels, y_pred_labels)
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'Blues',
    'xticklabels': emotion_list,
    'yticklabels': emotion_list,
}
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()