# Voice Emotion Detection Model

### Importing Libraries

In [20]:
import numpy as np
import pyaudio
import wave
import librosa
import tkinter as tk
from tkinter import filedialog
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential, load_model
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
import np_utils
import os
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import tensorflow as tf

### Function to extract audio features

In [8]:
def extract_audio_features(audio_path, n_mfcc=13):
    """Summarise one audio file as a single 1-D feature vector.

    The file is read with ``librosa.load`` using ``sr=None``. Three feature
    matrices are computed (MFCC, chroma STFT, spectral contrast), each one is
    averaged along axis 1, and the three mean vectors are concatenated in
    that order.

    Parameters
    ----------
    audio_path : path of the audio file passed to ``librosa.load``.
    n_mfcc : number of MFCC coefficients requested (default 13).

    Returns
    -------
    numpy.ndarray or None
        The concatenated mean vector, or ``None`` when any step raised; in
        that case the error is printed rather than propagated.
    """
    try:
        signal, sample_rate = librosa.load(audio_path, sr=None)
        feature_matrices = (
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc),
            librosa.feature.chroma_stft(y=signal, sr=sample_rate),
            librosa.feature.spectral_contrast(y=signal, sr=sample_rate),
        )
        # Collapse every matrix to one mean value per row, then join them
        return np.concatenate([np.mean(matrix, axis=1) for matrix in feature_matrices])
    except Exception as e:
        print(f"Error extracting features from {audio_path}: {e}")
        return None

### Function to load audio features

In [10]:
def load_audio_features(dataset_path, labels=('males', 'females'),
                        extension='.m4a', extractor=None):
    """Build a feature matrix and label vector from one sub-folder per class.

    For every entry of ``labels`` the folder ``dataset_path/<label>`` is listed,
    each file whose name ends with ``extension`` is passed to ``extractor``, and
    every non-``None`` result is stored together with the folder name as label.

    Parameters
    ----------
    dataset_path : directory that contains one sub-folder per label.
    labels : folder names to read; each name doubles as the class label.
        Defaults to ``('males', 'females')``.
    extension : file-name suffix to accept, compared case-insensitively so
        that e.g. ``.M4A`` files are not silently dropped. Defaults to ``'.m4a'``.
    extractor : callable ``path -> feature vector or None``. Defaults to
        ``extract_audio_features``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with one entry per successfully processed file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when a label folder does not exist.
    """
    if extractor is None:
        extractor = extract_audio_features

    suffix = extension.lower()
    features = []
    targets = []
    for label in labels:
        folder_path = os.path.join(dataset_path, label)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded train/test split) reproducible.
        for file in sorted(os.listdir(folder_path)):
            if not file.lower().endswith(suffix):
                continue
            feature = extractor(os.path.join(folder_path, file))
            # The extractor signals a failed file with None; skip those rows
            if feature is not None:
                features.append(feature)
                targets.append(label)
    return np.array(features), np.array(targets)

# Root folder of the gender corpus; adjust to where the data lives locally
dataset_path = 'VoxCeleb_gender/'
X_gender, y_gender = load_audio_features(dataset_path)

# Stop here if nothing was extracted instead of failing later during training
if not X_gender.size:
    raise ValueError("No features extracted. Please check the dataset and feature extraction process.")

  y, sr = librosa.load(audio_path, sr=None)  # Load M4A file


### Preprocessing the dataset for gender detection (scaling, label encoding, train/test split)

In [66]:
from tensorflow.keras.utils import to_categorical

# Encode labels: class-name strings -> integer ids -> one-hot rows
lb_gender = LabelEncoder()
y_gender_encoded = to_categorical(lb_gender.fit_transform(y_gender))

# Split the raw features first so that held-out rows cannot influence the scaler.
# The split depends only on the row count and random_state, so the same rows
# land in train/test as before.
X_gender_train_raw, X_gender_test_raw, y_train, y_test = train_test_split(
    X_gender, y_gender_encoded, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then reuse that transform for the
# test rows; fitting on the full matrix would leak test statistics into training.
scaler_gender = StandardScaler()
X_train = scaler_gender.fit_transform(X_gender_train_raw)
X_test = scaler_gender.transform(X_gender_test_raw)

# Keep the full scaled matrix under its original name: the model cell reads
# X_gender_scaled.shape[1] to size the input layer.
X_gender_scaled = scaler_gender.transform(X_gender)

### Defining, Training and Saving the model for gender detection

In [68]:
# Model definition: every sample is a 1-D sequence of scaled features with one channel
input_shape = (X_gender_scaled.shape[1], 1)

gender_model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # One softmax unit per one-hot label column
    Dense(y_gender_encoded.shape[1], activation='softmax'),
])

gender_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Conv1D expects (samples, steps, channels): append a trailing channel axis of size 1
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)

# The held-out split is passed as validation data for the per-epoch metrics
gender_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
)

# Persist the trained weights together with the scaler statistics.
# Note that the '*_std.npy' file stores scaler_gender.scale_.
gender_model.save_weights('gender_detection_model.weights.h5')
np.save('gender_scaler_mean.npy', scaler_gender.mean_)
np.save('gender_scaler_var.npy', scaler_gender.var_)
np.save('gender_scaler_std.npy', scaler_gender.scale_)

Epoch 1/50
150/150 ━━━━━━━━━━━━━━━━━━━━ 4s 10ms/step - accuracy: 0.7669 - loss: 0.4855 - val_accuracy: 0.8590 - val_loss: 0.3477
Epoch 2/50
150/150 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.8711 - loss: 0.3250 - val_accuracy: 0.8599 - val_loss: 0.3315
Epoch 3/50
150/150 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.8661 - loss: 0.3136 - val_accuracy: 0.8791 - val_loss: 0.3036
Epoch 4/50
150/150 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.8869 - loss: 0.2960 - val_accuracy: 0.8741 - val_loss: 0.3138
Epoch 5/50
150/150 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.8979 - loss: 0.2621 - val_accuracy: 0.8799 - val_loss: 0.3105
Epoch 6/50
150/150 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.8978 - loss: 0.2628 - val_accuracy: 0.8774 - val_loss: 0.3105
Epoch 7/50
150/150 

In [63]:
# Persist the trained gender model in two forms: the model itself under a
# '.keras' file name, and the string returned by to_json() as a text file.
gender_model.save('gender_detection_model.keras')
model_json = gender_model.to_json()
with open('gender_detection_model.json', 'w') as json_file:
    json_file.write(model_json)

In [39]:
# Write the gender model's to_json() string to 'gender_detection_model.json'.
# NOTE(review): this repeats the JSON export of the preceding cell and simply
# overwrites the same file; one of the two cells looks redundant.
model_json = gender_model.to_json()
with open('gender_detection_model.json', 'w') as json_file:
    json_file.write(model_json)

In [44]:
# Save the gender model once more, this time under a '.h5' file name
# (in addition to the '.keras' file and the weights-only file written earlier).
gender_model.save('gender_detection_model.h5')



### Function to extract MFCC features for emotion detection

In [72]:
def extract_features_from_audio(file_path, n_mfcc=13):
    """Return the per-coefficient mean of the MFCCs of one audio file.

    The file is read with ``librosa.load`` using ``sr=None``; the MFCC matrix
    is then averaged along axis 1.

    Parameters
    ----------
    file_path : path of the audio file passed to ``librosa.load``.
    n_mfcc : number of MFCC coefficients requested (default 13).

    Returns
    -------
    numpy.ndarray or None
        The mean MFCC vector, or ``None`` when loading or feature extraction
        raised; the error is printed instead of being propagated.
    """
    try:
        waveform, sample_rate = librosa.load(file_path, sr=None)
        coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=n_mfcc)
        return coefficients.mean(axis=1)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

### Function to load the emotion dataset

In [73]:
def load_data_from_directory(directory_path):
    """Walk ``directory_path`` and build emotion features and labels from ``.wav`` files.

    The emotion code is taken from the third ``_``-separated field of the file
    name (extension removed), e.g. ``1001_DFA_ANG_XX.wav`` -> ``ANG``, and is
    translated through ``emotion_map``. Files are visited in sorted order so
    that the row order of the result is reproducible.

    A file is skipped, with a printed message, when its name has fewer than
    three fields or its code is not in ``emotion_map``. This is decided before
    the audio is decoded, so no extraction work is spent on unusable files.
    A file is skipped silently here when ``extract_features_from_audio``
    returns ``None`` (that function prints its own error).

    Parameters
    ----------
    directory_path : root directory searched recursively with ``os.walk``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with one entry per successfully processed file.
    """
    emotion_map = {
        'SAD': 'sadness',
        'ANG': 'angry',
        'DIS': 'disgust',
        'FEA': 'fear',
        'HAP': 'happy',
        'NEU': 'neutral'
    }
    features = []
    labels = []
    for root, dirs, files in os.walk(directory_path):
        # Sorting in place fixes the order in which os.walk descends into sub-folders
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.wav'):
                continue

            # Drop the extension first so a code in the last field is not read as e.g. 'ANG.wav'
            fields = os.path.splitext(file_name)[0].split('_')
            if len(fields) < 3:
                print(f"Skipping {file_name}: no emotion code in the third '_'-separated field")
                continue

            emotion_code = fields[2]
            emotion_label = emotion_map.get(emotion_code)
            if not emotion_label:
                print(f"Unknown emotion code {emotion_code} in file {file_name}")
                continue

            file_path = os.path.join(root, file_name)
            print(f"Processing file: {file_path}")
            feature = extract_features_from_audio(file_path)
            if feature is not None:
                features.append(feature)
                labels.append(emotion_label)
    return np.array(features), np.array(labels)


### Loading the dataset for emotion detection

In [74]:
dataset_path = 'Crema'  # Path to your dataset
X, y = load_data_from_directory(dataset_path)

# Check if data is loaded
if X.size == 0 or y.size == 0:
    raise ValueError("No data loaded. Please check the dataset path and structure.")

print(f"Loaded {len(X)} samples with {len(y)} labels.")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Encode labels: emotion-name strings -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = to_categorical(label_encoder.fit_transform(y))

# Split the raw features first so that held-out rows cannot influence the scaler.
# The split depends only on the row count and random_state, so the same rows
# land in train/test as before.
X_emotion_train_raw, X_emotion_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse it for the test rows;
# fitting on the full matrix would leak test statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_emotion_train_raw)
X_test = scaler.transform(X_emotion_test_raw)

# Full scaled matrix kept under its original name for any cell that still reads it
X_scaled = scaler.transform(X)

# Reshape data for CNN: (samples, features) -> (samples, features, 1, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1, 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1, 1)


Processing file: Crema\1001_DFA_ANG_XX.wav
Processing file: Crema\1001_DFA_DIS_XX.wav
Processing file: Crema\1001_DFA_FEA_XX.wav
Processing file: Crema\1001_DFA_HAP_XX.wav
Processing file: Crema\1001_DFA_NEU_XX.wav
Processing file: Crema\1001_DFA_SAD_XX.wav
Processing file: Crema\1001_IEO_ANG_HI.wav
Processing file: Crema\1001_IEO_ANG_LO.wav
Processing file: Crema\1001_IEO_ANG_MD.wav
Processing file: Crema\1001_IEO_DIS_HI.wav
Processing file: Crema\1001_IEO_DIS_LO.wav
Processing file: Crema\1001_IEO_DIS_MD.wav
Processing file: Crema\1001_IEO_FEA_HI.wav
Processing file: Crema\1001_IEO_FEA_LO.wav
Processing file: Crema\1001_IEO_FEA_MD.wav
Processing file: Crema\1001_IEO_HAP_HI.wav
Processing file: Crema\1001_IEO_HAP_LO.wav
Processing file: Crema\1001_IEO_HAP_MD.wav
Processing file: Crema\1001_IEO_NEU_XX.wav
Processing file: Crema\1001_IEO_SAD_HI.wav
Processing file: Crema\1001_IEO_SAD_LO.wav
Processing file: Crema\1001_IEO_SAD_MD.wav
Processing file: Crema\1001_IOM_ANG_XX.wav
Processing 

### Defining the emotion detection model

In [75]:
# 2-D CNN over inputs shaped (features, 1, 1): all kernels and pools have width 1,
# so they only slide along the feature axis.
emotion_model = Sequential([
    Conv2D(32, (3, 1), activation='relu', input_shape=(X_train.shape[1], 1, 1)),
    MaxPooling2D(pool_size=(2, 1)),
    Conv2D(64, (3, 1), activation='relu'),
    MaxPooling2D(pool_size=(2, 1)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One softmax unit per one-hot label column
    Dense(y_encoded.shape[1], activation='softmax'),
])

emotion_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### Training and saving the emotion detection model

In [76]:
# Train on the training split; the held-out split is passed as validation data
history = emotion_model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Save the model
emotion_model.save('emotion_detection_model.h5')

# Persist the preprocessing state as well, mirroring what the gender cell does.
# Without the scaler statistics new audio cannot be scaled the way the training
# data was, and without the class list a predicted index cannot be mapped back
# to an emotion name. As in the gender cell, '*_std.npy' stores scaler.scale_.
np.save('emotion_scaler_mean.npy', scaler.mean_)
np.save('emotion_scaler_var.npy', scaler.var_)
np.save('emotion_scaler_std.npy', scaler.scale_)
np.save('emotion_label_classes.npy', label_encoder.classes_)

Epoch 1/30
187/187 ━━━━━━━━━━━━━━━━━━━━ 9s 10ms/step - accuracy: 0.2884 - loss: 1.6650 - val_accuracy: 0.3788 - val_loss: 1.5157
Epoch 2/30
187/187 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.3408 - loss: 1.5580 - val_accuracy: 0.4009 - val_loss: 1.4836
Epoch 3/30
187/187 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.3675 - loss: 1.5102 - val_accuracy: 0.4009 - val_loss: 1.4624
Epoch 4/30
187/187 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.3942 - loss: 1.4761 - val_accuracy: 0.4009 - val_loss: 1.4700
Epoch 5/30
187/187 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.3975 - loss: 1.4687 - val_accuracy: 0.4224 - val_loss: 1.4402
Epoch 6/30
187/187 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.4046 - loss: 1.4361 - val_accuracy: 0.4144 - val_loss: 1.4373
Epoch 7/30
187/187 



In [37]:
# Save the trained emotion model under a '.keras' file name
# (the training cell already wrote it under a '.h5' name).
emotion_model.save('emotion_detection_model.keras')

In [38]:
# Write the emotion model's to_json() string to 'emotion_detection_model.json'.
# Note: this rebinds the notebook-level name `model_json`, which the gender
# cells also assign.
model_json = emotion_model.to_json()
with open('emotion_detection_model.json', 'w') as json_file:
    json_file.write(model_json)

In [45]:
# Store only the emotion model's weights, matching the weights file written for the gender model.
emotion_model.save_weights('emotion_detection_model.weights.h5')

In [46]:
# NOTE(review): the training cell already saves to 'emotion_detection_model.h5';
# this call overwrites that file with the model's current state and looks redundant.
emotion_model.save('emotion_detection_model.h5')

