In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from IPython.display import Audio
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow.keras.layers as L
from tensorflow.keras.layers import Dense 
import tensorflow as tf
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

## Dataset

In [None]:
# Dataset root folder, given relative to the notebook's working directory.
# The "Crema" sub-folder holding the .wav files is joined onto it further down.
path = "../speech_emotion_dataset"


In [None]:
def load_and_listen(audio_path, sample_rate=22050):
    """Load an audio file, draw its waveform, and return an inline player.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        sample_rate: Value passed as ``sr`` to ``librosa.load`` (default 22050).

    Returns:
        An ``IPython.display.Audio`` object built from the loaded samples.
    """
    waveform, rate = librosa.load(audio_path, sr=sample_rate)

    # Waveform figure, rendered immediately.
    plt.figure(figsize=(14, 4))
    librosa.display.waveshow(waveform, sr=rate)
    plt.title(f"Waveform of {audio_path}")
    plt.ylabel("Amplitude")
    plt.xlabel("Time (s)")
    plt.grid(True)
    plt.show()

    # Returning the player lets the notebook render it as the cell output.
    return Audio(waveform, rate=rate)


In [None]:
from IPython.display import Audio, display

# Locate the folder with .wav files
audio_dir = os.path.join(path, "Crema")  # Make sure this folder name matches what you see

# List all .wav files. os.listdir() returns entries in arbitrary order, so the
# names are sorted to keep sample selection and later cells reproducible.
audio_files = sorted(f for f in os.listdir(audio_dir) if f.endswith(".wav"))
print(f"Total audio files: {len(audio_files)}")

# Mapping of emotion codes to labels
emotion_map = {
    "ANG": "Angry",
    "DIS": "Disgust",
    "FEA": "Fear",
    "HAP": "Happy",
    "NEU": "Neutral",
    "SAD": "Sad"
}

# Keep the first file encountered for each known emotion code.
# The code is the third '_'-separated field of the file name.
samples = {}
for file in audio_files:
    parts = file.split('_')
    if len(parts) < 3:
        continue  # name has no emotion field; skip instead of raising IndexError
    emotion_code = parts[2]
    if emotion_code in emotion_map and emotion_code not in samples:
        samples[emotion_code] = file

# Plot and listen to one file per emotion
for code, filename in samples.items():
    filepath = os.path.join(audio_dir, filename)
    signal, sr = librosa.load(filepath, sr=None)  # sr=None keeps the file's own rate

    # Plot waveform
    plt.figure(figsize=(10, 3))
    librosa.display.waveshow(signal, sr=sr)
    plt.title(f"{emotion_map[code]} - {filename}")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.tight_layout()
    plt.show()

    # Inline player. A playback problem should not abort the remaining plots,
    # but it is reported instead of being silently swallowed.
    try:
        display(Audio(filepath))
    except Exception as exc:
        print(f"Could not play {filename}: {exc}")

### Data Augmentation

In [None]:
def add_noise(data, random=False, rate=0.035, threshold=0.075):
    """Return ``data`` with zero-mean Gaussian noise added.

    The noise standard deviation is ``rate`` times the peak absolute amplitude
    of ``data``.

    Args:
        data: Array-like signal.
        random: When true, ``rate`` is replaced by a uniform draw from
            ``[0, threshold)``. (The name shadows the ``random`` module inside
            this function only; it is kept for caller compatibility.)
        rate: Noise level relative to the signal peak.
        threshold: Upper bound of the random rate when ``random`` is true.

    Returns:
        A new array with the same shape as ``data``; ``data`` is not modified.
    """
    data = np.asarray(data)
    if random:
        rate = np.random.random() * threshold
    # Use the peak *magnitude*: np.amax(data) alone is zero or negative for a
    # signal with no positive samples, which would disable the noise.
    # An empty signal has no peak, so treat it as 0 instead of raising.
    peak = np.amax(np.abs(data)) if data.size else 0.0
    # A single Gaussian draw per sample. Multiplying by a second independent
    # draw (as before) yields a heavy-tailed, non-Gaussian distribution.
    noise = rate * peak * np.random.normal(0, 1, size=data.shape)
    return data + noise

def shifting(data, rate=1000, wrap=False):
    """Shift ``data`` in time by a random number of samples.

    The shift is ``int(u * rate)`` with ``u`` drawn uniformly from ``[-5, 5)``.
    A positive shift delays the signal, a negative one advances it.

    Args:
        data: 1-D array-like signal.
        rate: Multiplier applied to the uniform draw to get the shift in samples.
        wrap: When true, samples pushed off one end re-enter at the other
            (``np.roll``); otherwise the vacated samples are zero-filled.

    Returns:
        An array with the same length and dtype as ``data``.
    """
    data = np.asarray(data)
    shift_amount = int(np.random.uniform(low=-5, high=5) * rate)
    if wrap:
        return np.roll(data, shift_amount)

    n = len(data)
    # zeros_like keeps the input dtype and guarantees the output length equals
    # the input length even when |shift_amount| >= n (the result is then all zeros).
    shifted = np.zeros_like(data)
    if shift_amount >= 0:
        keep = max(n - shift_amount, 0)
        shifted[n - keep:] = data[:keep]
    else:
        keep = max(n + shift_amount, 0)
        shifted[:keep] = data[n - keep:]
    return shifted


def pitching(y, sr, n_steps=4):
    """Pitch-shift ``y`` by ``n_steps`` steps via ``librosa.effects.pitch_shift``.

    Args:
        y: Audio signal.
        sr: Sampling rate of ``y``.
        n_steps: Number of steps to shift by (default 4).

    Returns:
        The pitch-shifted signal.
    """
    return librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

def stretching(data, rate=0.8):
    """Time-stretch ``data`` via ``librosa.effects.time_stretch``.

    Args:
        data: Audio signal.
        rate: Stretch factor forwarded to librosa (default 0.8).

    Returns:
        The time-stretched signal; its length generally differs from the input.
    """
    # `rate` must be passed by keyword: it is keyword-only in current librosa
    # releases, so a positional call raises TypeError there.
    return librosa.effects.time_stretch(data, rate=rate)


In [None]:
def extract_features(filepath, sr=16000, frame_length=2048, hop_length=512, n_mels=128, augment=False):
    """Load one audio file and compute its ZCR, frame energy, and mel spectrogram.

    Args:
        filepath: Path of the audio file.
        sr: Sampling rate requested from ``librosa.load``.
        frame_length: Analysis window size in samples (also used as ``n_fft``).
        hop_length: Step between successive frames in samples.
        n_mels: Number of mel bands.
        augment: When true, exactly one randomly chosen augmentation
            (noise, shift, pitch, or stretch) is applied before feature extraction.

    Returns:
        ``(zcr, energy, mel_spec_db)``:
        * ``zcr`` - zero-crossing rate per frame, 1-D.
        * ``energy`` - mean squared amplitude per frame, 1-D. Frames are taken
          from the raw signal without padding, so this may have a different
          length than ``zcr`` and is empty when the signal is shorter than
          ``frame_length``.
        * ``mel_spec_db`` - mel spectrogram in dB relative to its own maximum.
    """
    # Load audio
    signal, sr = librosa.load(filepath, sr=sr)  # Use desired sampling rate
    # Trim silent edges
    signal, _ = librosa.effects.trim(signal)

    if augment:
        # Randomly choose one augmentation
        aug_type = np.random.choice(['noise', 'shift', 'pitch', 'stretch'])
        if aug_type == 'noise':
            signal = add_noise(signal, random=True)
        elif aug_type == 'shift':
            signal = shifting(signal)
        elif aug_type == 'pitch':
            signal = pitching(signal, sr)
        elif aug_type == 'stretch':
            try:
                signal = stretching(signal, rate=np.random.uniform(0.8, 1.2))
            except Exception as exc:
                # Best effort: fall back to the un-stretched signal, but report
                # the failure instead of hiding it behind a bare except.
                print(f"Time-stretch augmentation skipped for {filepath}: {exc}")

    # --- Zero Crossing Rate ---
    zcr = librosa.feature.zero_crossing_rate(
        y=signal, frame_length=frame_length, hop_length=hop_length
    )[0]  # shape: (frames,)

    # --- Energy (normalized by frame length) ---
    energy = np.array([
        np.sum(signal[i:i+frame_length]**2) / frame_length
        for i in range(0, len(signal) - frame_length + 1, hop_length)
    ])

    # --- Mel Spectrogram ---
    mel_spec = librosa.feature.melspectrogram(
        y=signal, sr=sr, n_mels=n_mels,
        n_fft=frame_length, hop_length=hop_length
    )

    # Convert power to dB, using the spectrogram's own peak as the 0 dB reference
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    return zcr, energy, mel_spec_db

In [None]:
# Sanity-check the extractor on one randomly chosen file (with augmentation on)
test_file = os.path.join(audio_dir, random.choice(audio_files))
zcr, energy, mel_spec = extract_features(test_file, augment=True)
fs1 = (zcr, energy)
fs2 = mel_spec

# One row, three panels: the two 1D features first, then the spectrogram
plt.figure(figsize=(12, 4))

line_panels = ((zcr, "Zero Crossing Rate"), (energy, "Energy"))
for position, (series, panel_title) in enumerate(line_panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(series)
    plt.title(panel_title)

plt.subplot(1, 3, 3)
librosa.display.specshow(mel_spec, sr=16000, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title("Mel Spectrogram")

plt.tight_layout()
plt.show()

In [None]:
# Prepare containers (kept index-aligned: entry i of every list describes the same sample)
zcr_list = []
energy_list = []
mel_spec_list = []
labels = []

valid_emotions = {"ANG", "DIS", "FEA", "HAP", "NEU", "SAD"}

# Loop through files
for file in tqdm(audio_files):
    parts = file.split('_')
    # The emotion code is the third '_'-separated field; skip names without it
    if len(parts) < 3:
        continue
    emotion_code = parts[2]

    if emotion_code not in valid_emotions:
        continue

    filepath = os.path.join(audio_dir, file)

    try:
        # Original (non-augmented) data
        zcr, energy, mel_spec = extract_features(filepath, augment=False)
        zcr_list.append(zcr)
        energy_list.append(energy)
        mel_spec_list.append(mel_spec)
        labels.append(emotion_code)

        # Augmented version (one extra sample per original file)
        zcr_aug, energy_aug, mel_spec_aug = extract_features(filepath, augment=True)
        zcr_list.append(zcr_aug)
        energy_list.append(energy_aug)
        # Store the augmented spectrogram. Appending the original `mel_spec`
        # here would put a byte-identical duplicate in the dataset, which can
        # then land on both sides of the train/test split.
        mel_spec_list.append(mel_spec_aug)
        labels.append(emotion_code)  # Same label

    except Exception as e:
        print(f"Failed for {file}: {e}")

In [None]:
# The three feature lists must stay index-aligned; print their lengths as a check
print(len(zcr_list), len(energy_list), len(mel_spec_list))

# Encode the emotion codes as integer class ids
le = LabelEncoder()
label_array = le.fit_transform(labels)

# Get mapping (emotion code -> class id)
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_map)
print(label_array.shape)

# Convert to DataFrame: one row per sample, each feature cell holds an array
df = pd.DataFrame({
    'zcr': zcr_list,
    'energy': energy_list,
    'mel_spec': mel_spec_list,
    'label': label_array
})

X = df.drop(columns=['label'])
y = df['label']

print("Shape of features:", X.shape)
print("Shape of labels:", y.shape)


### Splitting the data

In [None]:
# Same seed for both stages so the whole split is repeatable
SPLIT_SEED = 42

# Stage 1: hold out 30% of the rows as the test set; the remaining 70% is train+val
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)

# Stage 2: carve 5% of train+val off as validation (5% of 70% ≈ 3.5% of all rows)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.05, stratify=y_temp, random_state=SPLIT_SEED
)

# Show shapes
for split_name, split_frame in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} shape:", split_frame.shape)

### Create the feature space

In [None]:
# First feature space: the first two feature columns
X_train1 = X_train.iloc[:, :2]
X_val1   = X_val.iloc[:, :2]
X_test1  = X_test.iloc[:, :2]

# Second feature space: every remaining column
X_train2 = X_train.iloc[:, 2:]
X_val2   = X_val.iloc[:, 2:]
X_test2  = X_test.iloc[:, 2:]

# Report the shapes, feature space 1 first, then feature space 2
for suffix, frames in (("1", (X_train1, X_val1, X_test1)), ("2", (X_train2, X_val2, X_test2))):
    for split_name, frame in zip(("Train", "Validation", "Test"), frames):
        print(f"{split_name} shape{suffix}:", frame.shape)

### Prepare Data for Model Input

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad 1D features
def stack_and_pad_1d(features1, features2, max_len=400):
    """Pad two collections of 1-D sequences to ``max_len`` and join them per sample.

    Args:
        features1: Iterable of 1-D sequences (one per sample).
        features2: Iterable of 1-D sequences, aligned with ``features1``.
        max_len: Length every sequence is padded or cut to.

    Returns:
        float32 array of shape ``(n_samples, 2 * max_len)``: the padded
        ``features1`` row followed by the padded ``features2`` row.
    """
    # Pad AND truncate at the end, so over-long sequences keep their first
    # max_len frames. The Keras default truncating='pre' would drop the start.
    pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post', dtype='float32')
    x1 = pad_sequences(features1, **pad_kwargs)
    x2 = pad_sequences(features2, **pad_kwargs)
    return np.concatenate([x1, x2], axis=-1)

# Same padding for every split: ZCR and energy of each sample end up in one row
X_train1_padded, X_val1_padded, X_test1_padded = (
    stack_and_pad_1d(split['zcr'], split['energy'])
    for split in (X_train1, X_val1, X_test1)
)

# Pad 2D mel spectrograms
def pad_melspec_2d(mels, max_time=400, max_freq=None, pad_value=0.0):
    """Pad or crop 2-D spectrograms to a common size and stack them.

    Args:
        mels: Iterable of 2-D arrays indexed ``[freq, time]``.
        max_time: Target size of the time axis (axis 1).
        max_freq: Target size of the frequency axis (axis 0). When ``None`` it
            is taken from the first spectrogram.
        pad_value: Constant used to fill padded cells. The default 0.0 keeps
            the previous behavior. NOTE(review): for dB spectrograms referenced
            to their maximum, 0 is the loudest level; a floor such as -80.0
            may represent silence better.

    Returns:
        Array of shape ``(n, max_freq, max_time, 1)``; the trailing axis is the
        channel dimension expected by 2D CNNs.
    """
    padded = []
    for m in mels:
        m = np.asarray(m)
        # Determine the frequency dimension size if not provided
        if max_freq is None:
            max_freq = m.shape[0]

        # Crop first (a no-op for axes that are already small enough) ...
        m_fixed = m[:max_freq, :max_time]

        # ... then pad whatever is still missing at the end of each axis
        freq_pad = max_freq - m_fixed.shape[0]
        time_pad = max_time - m_fixed.shape[1]
        if freq_pad or time_pad:
            m_fixed = np.pad(
                m_fixed, ((0, freq_pad), (0, time_pad)),
                mode='constant', constant_values=pad_value,
            )

        padded.append(m_fixed)

    return np.array(padded)[..., np.newaxis]  # Add channel dimension for 2D CNNs

# Apply the function to pad mel spectrograms
X_train2_padded = pad_melspec_2d(X_train2['mel_spec'])
X_val2_padded = pad_melspec_2d(X_val2['mel_spec'])
X_test2_padded = pad_melspec_2d(X_test2['mel_spec'])

# Convert labels to one-hot. The class count is passed explicitly so all three
# arrays have the same width even if a split happens to miss the highest class id.
n_classes = len(le.classes_)
y_train_arr = tf.keras.utils.to_categorical(y_train, num_classes=n_classes)
y_val_arr = tf.keras.utils.to_categorical(y_val, num_classes=n_classes)
y_test_arr = tf.keras.utils.to_categorical(y_test, num_classes=n_classes)

# Print shapes to verify
print("Train shape:", X_train1_padded.shape)
print("Validation shape:", X_val1_padded.shape)
print("Test shape:", X_test1_padded.shape)

print("Train 2D shape:", X_train2_padded.shape)
print("Validation 2D shape:", X_val2_padded.shape)
print("Test 2D shape:", X_test2_padded.shape)

## Define CNN Models

### 1D CNN (ZCR + Energy)

In [None]:
def build_1d_cnn(input_length=800, num_classes=6):
    """Build the (uncompiled) 1D CNN classifier for the ZCR + energy features.

    Args:
        input_length: Number of time steps per sample. The default 800 matches
            two feature sequences padded to 400 each and joined end to end.
        num_classes: Size of the softmax output layer.

    Returns:
        A ``tf.keras.Model`` mapping ``(input_length, 1)`` inputs to
        ``num_classes`` class probabilities.
    """
    inputs = tf.keras.Input(shape=(input_length, 1))

    # Stem
    x = L.Conv1D(512, kernel_size=5, strides=1, padding='same', activation='relu')(inputs)
    x = L.MaxPool1D(pool_size=3, strides=2, padding='same')(x)

    # Block 1
    x = L.Conv1D(512, kernel_size=5, padding='same', activation='relu')(x)
    x = L.Conv1D(512, kernel_size=3, padding='same', activation='relu')(x)
    x = L.MaxPool1D(pool_size=3, strides=2, padding='same')(x)
    x = L.Dropout(0.3)(x)

    # Block 2
    x = L.Conv1D(256, kernel_size=3, padding='same', activation='relu')(x)
    x = L.Conv1D(256, kernel_size=3, padding='same', activation='relu')(x)
    x = L.MaxPool1D(pool_size=3, strides=2, padding='same')(x)
    x = L.Dropout(0.3)(x)

    # Block 3
    x = L.Conv1D(128, kernel_size=3, padding='same', activation='relu')(x)
    x = L.MaxPool1D(pool_size=2, strides=2, padding='same')(x)

    # Final Layers
    x = L.GlobalAveragePooling1D()(x)  # Less prone to overfitting than Flatten
    x = L.Dense(512, activation='relu')(x)
    outputs = L.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)
    return model

### 2D CNN (Mel Spectrogram)

In [None]:
def build_2d_cnn(input_shape=(128, 400, 1), num_classes=6):
    """Build the (uncompiled) 2D CNN classifier for mel spectrograms.

    Args:
        input_shape: ``(freq_bins, time_frames, channels)`` of one sample. The
            default matches 128 mel bands padded to 400 frames with one channel.
        num_classes: Size of the softmax output layer.

    Returns:
        A ``tf.keras.Model`` mapping ``input_shape`` inputs to ``num_classes``
        class probabilities.
    """
    inputs = tf.keras.Input(shape=input_shape)  # Mel-spectrogram shape

    # Stem
    x = L.Conv2D(512, kernel_size=(5, 5), strides=(1, 1), padding='same', activation='relu')(inputs)
    x = L.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)

    # Block 1
    x = L.Conv2D(512, kernel_size=(5, 5), padding='same', activation='relu')(x)
    x = L.Conv2D(512, kernel_size=(3, 3), padding='same', activation='relu')(x)
    x = L.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    x = L.Dropout(0.3)(x)

    # Block 2
    x = L.Conv2D(256, kernel_size=(3, 3), padding='same', activation='relu')(x)
    x = L.Conv2D(256, kernel_size=(3, 3), padding='same', activation='relu')(x)
    x = L.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)
    x = L.Dropout(0.3)(x)

    # Block 3
    x = L.Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu')(x)
    x = L.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same')(x)

    # Final Layers
    x = L.GlobalAveragePooling2D()(x)
    x = L.Dense(512, activation='relu')(x)
    outputs = L.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)
    return model

### Train the 1D CNN Model

In [None]:
# Both callbacks watch validation accuracy: stop after 10 epochs without
# improvement (restoring the best weights), and lower the learning rate after 5.
early_stopping = EarlyStopping(
    monitor='val_accuracy', mode='auto', patience=10, restore_best_weights=True
)
lr_on_plateau = ReduceLROnPlateau(monitor='val_accuracy', patience=5)
callbacks = [early_stopping, lr_on_plateau]

In [None]:
# Fit the scaler on the training split only, then apply it to all three splits
scaler = StandardScaler().fit(X_train1_padded)
X_train1_scaled, X_val1_scaled, X_test1_scaled = (
    scaler.transform(split)
    for split in (X_train1_padded, X_val1_padded, X_test1_padded)
)


In [None]:
# Train 1D CNN
model_1d = build_1d_cnn()
model_1d.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_1d.summary()
# The fit must actually run: the 1D plotting cell reads `history_1D`, and the
# evaluation cell evaluates and saves `model_1d`. With the fit commented out the
# notebook fails on a fresh kernel and would save untrained weights.
history_1D = model_1d.fit(X_train1_scaled, y_train_arr, epochs=100, batch_size=128,
                          validation_data=(X_val1_scaled, y_val_arr), callbacks=callbacks)

In [None]:
# Raw string: in a normal string literal "\U" starts a Unicode escape, so the
# un-prefixed Windows path is a SyntaxError before anything runs.
# NOTE(review): absolute, machine-specific path - consider a path relative to the notebook.
MODEL_1D_PATH = r"C:\Users\Kimo Store\Desktop\Term 8\Pattern\Labs\Lab 3\Speech-Emotion-Recognition\model_1d.h5"
loaded_model = tf.keras.models.load_model(MODEL_1D_PATH)

### Train the 2D CNN Model

In [None]:
# Build and compile the 2D CNN. The fit call comes later in this cell, after
# the batch generators have been created.
model_2d = build_2d_cnn()
model_2d.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model_2d.summary()

from tensorflow.keras.utils import Sequence
import numpy as np

class DataGenerator(Sequence):
    """Serve ``(X, y)`` to Keras in fixed-size batches, in their given order.

    Args:
        X: Indexable feature array; the first axis is the sample axis.
        y: Indexable target array aligned with ``X``.
        batch_size: Number of samples per batch; the last batch may be smaller.
        **kwargs: Forwarded to the Keras base class.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``batch_size`` is
            not positive.
    """

    def __init__(self, X, y, batch_size=32, **kwargs):
        # Initialise the Keras base class; newer Keras versions warn when a
        # subclass skips this call.
        super().__init__(**kwargs)
        if len(X) != len(y):
            raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        self.X = X
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (the final partial batch counts)."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as an ``(X_batch, y_batch)`` pair."""
        start = index * self.batch_size
        end = start + self.batch_size
        return self.X[start:end], self.y[start:end]

train_gen = DataGenerator(X_train2_padded, y_train_arr)
val_gen = DataGenerator(X_val2_padded, y_val_arr)

# Stored as `history_2D` because the 2D plotting cell reads that name;
# assigning to `history` only left it undefined there.
history_2D = model_2d.fit(train_gen, validation_data=val_gen, epochs=30, callbacks=callbacks)
history = history_2D  # keep the previous name available



### 1D Model Plotting

In [None]:
# Per-epoch curves recorded while fitting the 1D model
hist_1d = history_1D.history
train_acc_1d = hist_1d['accuracy']
val_acc_1d = hist_1d['val_accuracy']
train_loss_1d = hist_1d['loss']
val_loss_1d = hist_1d['val_loss']

print("Train accuracy 1D:", train_acc_1d)
print("Validation accuracy 1D:", val_acc_1d)

# Two panels side by side: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))
panels_1d = (
    ('Accuracy', train_acc_1d, val_acc_1d),
    ('Loss', train_loss_1d, val_loss_1d),
)
for slot, (metric_name, train_curve, val_curve) in enumerate(panels_1d, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(train_curve, label='Train 1D')
    plt.plot(val_curve, label='Validation 1D')
    plt.title(metric_name)
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
plt.tight_layout()
plt.show()


### 2D Model Plotting

In [None]:
# Per-epoch curves recorded while fitting the 2D model
hist_2d = history_2D.history
train_acc_2d = hist_2d['accuracy']
val_acc_2d = hist_2d['val_accuracy']
train_loss_2d = hist_2d['loss']
val_loss_2d = hist_2d['val_loss']

print("Train accuracy 2D:", train_acc_2d)
print("Validation accuracy 2D:", val_acc_2d)

# Two panels side by side: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))
panels_2d = (
    ('Accuracy', train_acc_2d, val_acc_2d),
    ('Loss', train_loss_2d, val_loss_2d),
)
for slot, (metric_name, train_curve, val_curve) in enumerate(panels_2d, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(train_curve, label='Train 2D')
    plt.plot(val_curve, label='Validation 2D')
    plt.title(metric_name)
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
plt.tight_layout()
plt.show()


## Evaluation

In [None]:
from sklearn.metrics import f1_score

# Raw strings: in a normal string literal "\U" starts a Unicode escape, so the
# un-prefixed Windows paths are a SyntaxError that stops the whole cell.
# NOTE(review): absolute, machine-specific paths - consider paths relative to the notebook.
MODEL_1D_PATH = r"C:\Users\Kimo Store\Desktop\Term 8\Pattern\Labs\Lab 3\Speech-Emotion-Recognition\model_1d.h5"
MODEL_2D_PATH = r"C:\Users\Kimo Store\Desktop\Term 8\Pattern\Labs\Lab 3\Speech-Emotion-Recognition\model_2d.h5"

# Evaluate the 1D model on the test set
test_loss_1d, test_acc_1d = model_1d.evaluate(X_test1_scaled, y_test_arr)
print(f"Test accuracy 1D: {test_acc_1d:.4f}")

# Weighted F1 for the 1D model; predictions are class probabilities, so take the argmax
y_pred_1d = model_1d.predict(X_test1_scaled)
y_pred_1d_classes = np.argmax(y_pred_1d, axis=1)
f1_1d = f1_score(y_test, y_pred_1d_classes, average='weighted')
print(f"F1 Score 1D: {f1_1d:.4f}")

# Evaluate the 2D model on the test set
test_loss_2d, test_acc_2d = model_2d.evaluate(X_test2_padded, y_test_arr)
print(f"Test accuracy 2D: {test_acc_2d:.4f}")

# Weighted F1 for the 2D model
y_pred_2d = model_2d.predict(X_test2_padded)
y_pred_2d_classes = np.argmax(y_pred_2d, axis=1)
f1_2d = f1_score(y_test, y_pred_2d_classes, average='weighted')
print(f"F1 Score 2D: {f1_2d:.4f}")

# Save the models
model_1d.save(MODEL_1D_PATH)
model_2d.save(MODEL_2D_PATH)
# Load the models back from disk
loaded_model = tf.keras.models.load_model(MODEL_1D_PATH)
loaded_model2 = tf.keras.models.load_model(MODEL_2D_PATH)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrices (rows: true class, columns: predicted class)
conf_matrix_1d = confusion_matrix(y_test, y_pred_1d_classes)
conf_matrix_2d = confusion_matrix(y_test, y_pred_2d_classes)

# One heatmap per model, 1D first
heatmaps = (
    (conf_matrix_1d, "Confusion Matrix - 1D Model"),
    (conf_matrix_2d, "Confusion Matrix - 2D Model"),
)
for matrix, heading in heatmaps:
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(heading)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Most confused pair for the 1D model: zero the diagonal (correct predictions)
# on a copy, then locate the largest remaining cell as (true, predicted)
confusion_1d = conf_matrix_1d.copy()
np.fill_diagonal(confusion_1d, 0)
most_confusing_1d = np.unravel_index(confusion_1d.argmax(), confusion_1d.shape)
true_1d, pred_1d = most_confusing_1d
print(f"Most confusing classes for 1D model: {le.classes_[true_1d]} and {le.classes_[pred_1d]}")

# Same search for the 2D model
confusion_2d = conf_matrix_2d.copy()
np.fill_diagonal(confusion_2d, 0)
most_confusing_2d = np.unravel_index(confusion_2d.argmax(), confusion_2d.shape)
true_2d, pred_2d = most_confusing_2d
print(f"Most confusing classes for 2D model: {le.classes_[true_2d]} and {le.classes_[pred_2d]}")