In [49]:
import os

## Setting up the kaggle environment instead of downloading the dataset

In [50]:
os.environ['KAGGLE_CONFIG_DIR'] = "Female-voice-emotion-detection/"

In [51]:
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio --unzip -p /content/data

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content/data




  0%|          | 0.00/429M [00:00<?, ?B/s]
  0%|          | 1.00M/429M [00:00<06:51, 1.09MB/s]
  0%|          | 2.00M/429M [00:01<03:39, 2.04MB/s]
  1%|          | 3.00M/429M [00:01<02:22, 3.15MB/s]
  1%|          | 4.00M/429M [00:01<01:51, 3.99MB/s]
  1%|1         | 5.00M/429M [00:01<01:30, 4.89MB/s]
  1%|1         | 6.00M/429M [00:01<01:20, 5.48MB/s]
  2%|1         | 7.00M/429M [00:01<01:19, 5.56MB/s]
  2%|1         | 8.00M/429M [00:02<01:16, 5.80MB/s]
  2%|2         | 9.00M/429M [00:02<01:17, 5.71MB/s]
  2%|2         | 10.0M/429M [00:02<01:11, 6.12MB/s]
  3%|2         | 11.0M/429M [00:02<01:17, 5.64MB/s]
  3%|2         | 12.0M/429M [00:02<01:15, 5.77MB/s]
  3%|3         | 13.0M/429M [00:02<01:15, 5.78MB/s]
  3%|3         | 14.0M/429M [00:03<01:13, 5.93MB/s]
  3%|3         | 15.0M/429M [00:03<01:17, 5.62MB/s]
  4%|3         | 16.0M/429M [00:03<01:10, 6.13MB/s]
  4%|3         | 17.0M/429M [00:03<01:11, 6.03MB/s]
  4%|4         | 18.0M/429M [00:03<01:16, 5.63MB/s]
  4%|4         | 19.

In [52]:
print(os.listdir('/content/data'))

['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10', 'Actor_11', 'Actor_12', 'Actor_13', 'Actor_14', 'Actor_15', 'Actor_16', 'Actor_17', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_21', 'Actor_22', 'Actor_23', 'Actor_24', 'audio_speech_actors_01-24']


In [53]:
import librosa
import numpy as np
import tensorflow
from tensorflow.keras.layers import Conv2D , MaxPooling2D , Flatten , Dense , Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

In [54]:
data_path = '/content/data'

# As per the RAVDESS file description emotion labels are :Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

In [56]:
emotion_dict = {
    '01': 'Neutral', '02': 'Calm', '03': 'Happy', '04': 'Sad', 
    '05': 'Angry', '06': 'Fearful', '07': 'Disgust', '08': 'Surprised'
}

In [57]:
def parse_file(name):
    parts = name.split('-')
    id = int(parts[6].split('.')[0])
    emotion = parts[2]
    return id , emotion_dict[emotion]

In [58]:
from sklearn.preprocessing import StandardScaler

# Features Extraction

In [59]:
def extract_features_from_female_voices(dataset_path):
    features = []
    labels = []
    for actor_folder in os.listdir(dataset_path):
        actor_folder_path = os.path.join(dataset_path, actor_folder)
        if not os.path.isdir(actor_folder_path):
            continue
        if actor_folder.startswith('Actor_'):
            actor_id = int(actor_folder.split('_')[1])  
            if actor_id % 2 == 0:
                print(actor_id)
                for filename in os.listdir(actor_folder_path):
                    if filename.endswith('.wav'):
                        actor_id, emotion = parse_file(filename)
                        file_path = os.path.join(actor_folder_path, filename)
                        
                        y, sr = librosa.load(file_path, duration=4, sr=22050)
                        
                        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
                        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
                        mel = librosa.feature.melspectrogram(y=y, sr=sr)
                        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
                        mfcc = np.mean(mfcc.T, axis=0)  
                        chroma = np.mean(chroma.T , axis = 0)
                        mel = np.mean(mel.T , axis = 0)
                        contrast = np.mean(contrast.T , axis = 0)

                        feature_vector = np.hstack([mfcc,chroma,mel,contrast])
                        features.append(feature_vector)
                        labels.append(emotion)
    
    return np.array(features), np.array(labels)

In [60]:
features , labels = extract_features_from_female_voices(data_path)

2
4
6
8
10
12
14
16
18
20
22
24


In [61]:
features

array([[-650.71087646,   54.47730255,   -9.09012604, ...,   16.9506885 ,
          17.00934124,   41.89358711],
       [-646.01690674,   49.11970901,  -10.80037117, ...,   16.23188555,
          16.13237261,   43.3635453 ],
       [-622.57867432,   51.31145096,  -10.43953609, ...,   16.27188895,
          16.62613895,   44.1792086 ],
       ...,
       [-541.25164795,   29.59570885,  -18.00136185, ...,   17.01178966,
          18.31539014,   41.85908053],
       [-492.65252686,   23.88798141,   -6.02665901, ...,   17.71937313,
          17.30182477,   42.74130653],
       [-517.98480225,   29.57121468,   -3.90969872, ...,   16.59385466,
          16.62018105,   43.96294669]])

In [62]:
labels

array(['Neutral', 'Neutral', 'Neutral', 'Neutral', 'Calm', 'Calm', 'Calm',
       'Calm', 'Calm', 'Calm', 'Calm', 'Calm', 'Happy', 'Happy', 'Happy',
       'Happy', 'Happy', 'Happy', 'Happy', 'Happy', 'Sad', 'Sad', 'Sad',
       'Sad', 'Sad', 'Sad', 'Sad', 'Sad', 'Angry', 'Angry', 'Angry',
       'Angry', 'Angry', 'Angry', 'Angry', 'Angry', 'Fearful', 'Fearful',
       'Fearful', 'Fearful', 'Fearful', 'Fearful', 'Fearful', 'Fearful',
       'Disgust', 'Disgust', 'Disgust', 'Disgust', 'Disgust', 'Disgust',
       'Disgust', 'Disgust', 'Surprised', 'Surprised', 'Surprised',
       'Surprised', 'Surprised', 'Surprised', 'Surprised', 'Surprised',
       'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Calm', 'Calm', 'Calm',
       'Calm', 'Calm', 'Calm', 'Calm', 'Calm', 'Happy', 'Happy', 'Happy',
       'Happy', 'Happy', 'Happy', 'Happy', 'Happy', 'Sad', 'Sad', 'Sad',
       'Sad', 'Sad', 'Sad', 'Sad', 'Sad', 'Angry', 'Angry', 'Angry',
       'Angry', 'Angry', 'Angry', 'Angry', 'Angry', 'Fearf

In [63]:
len(labels)

720

In [64]:
from sklearn.preprocessing import LabelEncoder

# Label Encoding

In [65]:
features = features / np.max(np.abs(features),axis = 0)

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

labels = to_categorical(labels , num_classes=len(emotion_dict))

x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.35, random_state=42)

In [66]:
len(x_train) , len(y_train)

(468, 468)

In [67]:
x_train

array([[-0.74846149,  0.63391488, -0.12385911, ...,  0.79237673,
         0.7360514 ,  0.82515261],
       [-0.44965979,  0.44088529, -0.37404054, ...,  0.88947483,
         0.83315649,  0.93207194],
       [-0.63672172,  0.65137731, -0.00823582, ...,  0.88781303,
         0.89423918,  0.92968471],
       ...,
       [-0.58520309,  0.78394482, -0.16630727, ...,  0.88343049,
         0.85672931,  0.91571999],
       [-0.68611914,  0.48068543, -0.14929527, ...,  0.78981119,
         0.77086211,  0.8303982 ],
       [-0.51148154,  0.22304579, -0.31014749, ...,  0.88687954,
         0.81446945,  0.83250822]])

In [68]:
y_train

array([[0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [69]:
x_test

array([[-0.51162622,  0.43021147, -0.37243392, ...,  0.85230086,
         0.79384707,  0.92802111],
       [-0.56659539,  0.70226458, -0.36685081, ...,  0.9207377 ,
         0.92009064,  0.93537149],
       [-0.70644639,  0.5097556 , -0.25080615, ...,  0.77139598,
         0.75888865,  0.83819544],
       ...,
       [-0.79818156,  0.56730298, -0.10348944, ...,  0.84577838,
         0.75459974,  0.87491875],
       [-0.69886728,  0.59116288, -0.2783929 , ...,  0.79250724,
         0.74405374,  0.94517427],
       [-0.6896567 ,  0.43602955, -0.14703336, ...,  0.83449086,
         0.78264861,  0.82826704]])

In [70]:
y_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

# Reshaping

In [71]:
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1, 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1, 1)

In [72]:
x_train

array([[[[-0.74846149]],

        [[ 0.63391488]],

        [[-0.12385911]],

        ...,

        [[ 0.79237673]],

        [[ 0.7360514 ]],

        [[ 0.82515261]]],


       [[[-0.44965979]],

        [[ 0.44088529]],

        [[-0.37404054]],

        ...,

        [[ 0.88947483]],

        [[ 0.83315649]],

        [[ 0.93207194]]],


       [[[-0.63672172]],

        [[ 0.65137731]],

        [[-0.00823582]],

        ...,

        [[ 0.88781303]],

        [[ 0.89423918]],

        [[ 0.92968471]]],


       ...,


       [[[-0.58520309]],

        [[ 0.78394482]],

        [[-0.16630727]],

        ...,

        [[ 0.88343049]],

        [[ 0.85672931]],

        [[ 0.91571999]]],


       [[[-0.68611914]],

        [[ 0.48068543]],

        [[-0.14929527]],

        ...,

        [[ 0.78981119]],

        [[ 0.77086211]],

        [[ 0.8303982 ]]],


       [[[-0.51148154]],

        [[ 0.22304579]],

        [[-0.31014749]],

        ...,

        [[ 0.88687954]],

        

In [73]:
from tensorflow.keras.layers import BatchNormalization , Dropout , Activation

In [74]:
import tensorflow 

# CNN Model

In [75]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Conv2D, Activation, GlobalAveragePooling2D ,AveragePooling2D, Flatten, Dense, Dropout

def build_cnn(input_shape):
    model = Sequential()

    model.add(Conv2D(32, (3, 3), input_shape=input_shape, padding='same',activation='relu'))
    model.add(AveragePooling2D(pool_size=(1, 1)))
    
    model.add(Conv2D(64, (3, 3), padding='same',activation='relu'))
    model.add(AveragePooling2D(pool_size=(1, 1))) 
    
    model.add(Conv2D(128, (3, 3), padding='same',activation='relu'))
    model.add(AveragePooling2D(pool_size=(1, 1)))  

    model.add(Conv2D(256, (3, 3), padding='same',activation='relu'))
    model.add(AveragePooling2D(pool_size=(1, 1))) 

    model.add(Conv2D(512, (3, 3), padding='same',activation='relu'))
    model.add(AveragePooling2D(pool_size=(1, 1)))  
    
    model.add(Flatten())
    
    model.add(Dense(1024, activation='relu',kernel_regularizer = l2(0.001)))
    model.add(Dropout(0.50))
              
    model.add(Dense(8, activation='softmax'))  

    model.compile(optimizer=Adam(learning_rate = 0.0003), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [76]:
model = build_cnn((x_train.shape[1],1,1))
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [77]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [78]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Model Checkpoint

In [None]:
checkpoint = ModelCheckpoint("model_weights.weights.h5",
                       monitor = 'val_accuracy',
                       save_weights_only=True,
                            mode= 'max')
lr_scheduler = ReduceLROnPlateau(monitor = 'val_loss' , factor = 0.5 , patience=3 , verbose =1)
callback = [checkpoint , lr_scheduler]

In [None]:
history = model.fit(x_train, y_train, epochs=50, batch_size=32, validation_data=(x_test, y_test),callbacks=callback)

# Final Enhanced Model

In [98]:
from tensorflow.keras.models import model_from_json

with open("Voice_detection_model11_feb.json", "r") as json_file:
    json_model = json_file.read()
loaded_model = model_from_json(json_model)
loaded_model.load_weights("model_11feb_weights.weights.h5")
loaded_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print("Model Loaded Successfully")

Model Loaded Successfully


In [100]:
loaded_model.evaluate(x = x_test , y = y_test)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 290ms/step - accuracy: 0.7409 - loss: 1.3878


[1.5716071128845215, 0.7023809552192688]

In [102]:
y_pred = loaded_model.predict(x_test)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 392ms/step


In [103]:
from sklearn.metrics import confusion_matrix , accuracy_score
import seaborn as sns

In [106]:
y_pred_labels = (y_pred > 0.5).astype(int)

In [108]:
y_pred_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [110]:
from sklearn.metrics import multilabel_confusion_matrix
cm = multilabel_confusion_matrix(y_test, y_pred_labels)

In [112]:
accuracy = accuracy_score(y_test , y_pred_labels)
print("Accuracy:",accuracy*100,"%")

Accuracy: 67.85714285714286 %


In [114]:
cm[0]

array([[205,   9],
       [ 12,  26]], dtype=int64)

In [115]:

cm[1]

array([[216,   9],
       [  5,  22]], dtype=int64)

In [118]:
cm[7]

array([[204,  10],
       [ 10,  28]], dtype=int64)

In [120]:
loaded_model.save("Final_detection_model.h5")



# GUI

In [None]:
import tensorflow 
from tensorflow.keras.models import model_from_json

In [None]:
!pip install pyaudio

In [None]:
!pip install sounddevice

In [None]:
import sounddevice as sd

In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
import numpy as np
import librosa
import sounddevice as sd
import wave
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from sklearn.preprocessing import LabelEncoder

# Load the trained model
with open("Voice_detection_model11_feb.json", "r") as json_file:
    json_model = json_file.read()
loaded_model = model_from_json(json_model)
loaded_model.load_weights("model_11feb_weights.weights.h5")
loaded_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print("Model Loaded Successfully")

emotion_dict = {
    '01': 'Neutral', '02': 'Calm', '03': 'Happy', '04': 'Sad', 
    '05': 'Angry', '06': 'Fearful', '07': 'Disgust', '08': 'Surprised'
}

label_encoder = LabelEncoder()
label_encoder.fit(list(emotion_dict.values()))

def detect_gender(file_path):
    try:
        y , sr = librosa.load(file_path , sr = 22050)
        f0,_,_ = librosa.pyin(y , fmin = 85 , fmax = 300)
        f0 = f0[~np.isnan(f0)]
        if len(f0)==0:
            return "Unknown"
        avg_pitch = np.mean(f0)
        return "Female" if avg_pitch>165 else "Male"
    except Exception as e:
        print(e)
        return "Unknown"

def predict_emotion(file_path):
    gender = detect_gender(file_path)
    if gender == "Unknown":
        messagebox.showerror("Error")
        return "Invalid"
    y, sr = librosa.load(file_path, duration=4, sr=22050)
    
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr) 
    mfcc = np.mean(mfcc.T, axis=0)
    chroma = np.mean(chroma.T, axis=0)
    mel = np.mean(mel.T, axis=0)
    contrast = np.mean(contrast.T, axis=0)

    features = np.hstack([mfcc, chroma, mel, contrast])
    
    features = features.reshape(1, features.shape[0], 1, 1)

    prediction = loaded_model.predict(features)
    predicted_index = np.argmax(prediction) + 1  # Ensure it starts from '01'
    
    predicted_emotion = emotion_dict.get(f"{predicted_index:02d}", "Unknown")
    return gender , predicted_emotion

def upload_audio():
    file_path = filedialog.askopenfilename(filetypes=[("WAV files", "*.wav")])
    if file_path:
        gender , emotion = predict_emotion(file_path)
        if emotion != "Invalid":
            result_label.config(text=f"Gender:{gender}\nEmotion Detected: {emotion}", fg="green")

# Function to record audio
def record_audio():
    fs = 22050  # Sampling frequency
    seconds = 4  # Duration of recording
    messagebox.showinfo("Recording", "Recording for 4 seconds. Speak now...")

    try:
        recording = sd.rec(int(seconds * fs), samplerate=fs, channels=1, dtype=np.int16)
        sd.wait()
    
        file_path = "recorded_audio.wav"
        
        # Save recorded audio using the built-in wave module
        with wave.open(file_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(fs)
            wf.writeframes(recording.tobytes())
    
        gender , emotion = predict_emotion(file_path)
        if emotion != "Invalid":
            result_label.config(text=f"Gender:{gender}\n Emotion Detected: {emotion}", fg="green")
    except Exception as e:
        messagebox.showerror("Error",f"Recording Failed: {str(e)}")
root = tk.Tk()
root.title("Emotion Detection from Female Voice")
root.geometry("400x300")

upload_button = tk.Button(root, text="Upload Voice Note", command=upload_audio, width=25 , bg="#4CAF50",fg = "white")
upload_button.pack(pady=10)

record_button = tk.Button(root, text="Record Voice", command=record_audio, width=25,bg = "#008CBA",fg = "white")
record_button.pack(pady=10)

result_label = tk.Label(root, text="", font=("Arial", 12))
result_label.pack(pady=20)

root.mainloop()
