In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input
import librosa as lb
import pandas as pd
import librosa.display
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelBinarizer, StandardScaler


In [2]:
def extract_features(main_dir, list_of_features):
    features_list = []
    labels = []

    for subfolder in os.listdir(main_dir):
        subfolder_path = os.path.join(main_dir, subfolder)
        if os.path.isdir(subfolder_path):
            # Iterate over each audio file in the subfolder
            for filename in os.listdir(subfolder_path):
                if filename.endswith('.wav'):
                    file_path = os.path.join(subfolder_path, filename)
                    
                    y, sr = librosa.load(file_path)

                    features = []
                    if 'mfcc' in list_of_features:
                        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                        features.extend(mfcc.mean(axis=1))

                    if 'chroma' in list_of_features:
                        stft = np.abs(librosa.stft(y))
                        chroma = librosa.feature.chroma_stft(S=stft, sr=sr)
                        features.extend(chroma.mean(axis=1))

                    if 'melspectrogram' in list_of_features:
                        mel_spect = librosa.feature.melspectrogram(y=y, sr=sr)
                        mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)
                        features.extend(mel_spect_db.mean(axis=1))

                    features_list.append(features)
                    labels.append(filename[7])  

    return features_list, labels


In [5]:
def load_data(test_size1=0.2):
    filename = r'D:\IITB work\SoC-Speech Emotion Recognition\week 5&6\Submission\Audio_Speech_Actors_01-24'
    filename1 = r'D:\IITB work\SoC-Speech Emotion Recognition\week 5&6\Submission\Audio_Song_Actors_01-24'
    listf = ['mfcc', 'chroma', 'melspectrogram']
    
    features, labels = extract_features(main_dir=filename, list_of_features=listf)
    features1, labels1 = extract_features(main_dir=filename1, list_of_features=listf)
    
    features_array = np.array(features, dtype=np.float32)
    labels_array = np.array(labels)
    features_array1 = np.array(features1, dtype=np.float32)
    labels_array1 = np.array(labels1)
    
    features_array2 = np.concatenate((features_array, features_array1), axis=0)
    labels_array2 = np.concatenate((labels_array, labels_array1), axis=0)
    
    scaler = StandardScaler()
    features_array2 = scaler.fit_transform(features_array2)
    
    lb = LabelBinarizer()
    labels_one_hot = lb.fit_transform(labels_array2)
    
    X_train, X_test, y_train, y_test = train_test_split(features_array2, labels_one_hot, test_size=test_size1, random_state=42)
    return X_train, X_test, y_train, y_test, lb



In [11]:
X_train, X_test, y_train, y_test, lb = load_data()

In [12]:
model = tf.keras.Sequential([
    Dense(300, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(y_train.shape[1], activation='softmax')
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])  
model.fit(X_train, y_train, epochs=200, batch_size=256, validation_split=0.1)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Accuracy: {test_accuracy*100:.4f}')

Epoch 1/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.1732 - loss: 2.1734 - val_accuracy: 0.2893 - val_loss: 1.7548
Epoch 2/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3576 - loss: 1.6753 - val_accuracy: 0.4010 - val_loss: 1.6167
Epoch 3/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4786 - loss: 1.4479 - val_accuracy: 0.4569 - val_loss: 1.4915
Epoch 4/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5354 - loss: 1.3151 - val_accuracy: 0.5025 - val_loss: 1.3990
Epoch 5/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5912 - loss: 1.2186 - val_accuracy: 0.5431 - val_loss: 1.3451
Epoch 6/200
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6154 - loss: 1.1581 - val_accuracy: 0.5990 - val_loss: 1.2862
Epoch 7/200
[1m7/7[0m [32m━━━━━━━━━━━━

In [14]:
print(f'Test Accuracy: {test_accuracy*100:.4f}')

Test Accuracy: 74.5418


In [15]:
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
print(classification_report(y_test_labels, y_pred_labels))

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
              precision    recall  f1-score   support

           0       0.88      0.74      0.80        38
           1       0.78      0.81      0.80        81
           2       0.77      0.68      0.72        73
           3       0.68      0.76      0.72        71
           4       0.81      0.83      0.82        69
           5       0.71      0.71      0.71        80
           6       0.64      0.60      0.62        45
           7       0.71      0.79      0.75        34

    accuracy                           0.75       491
   macro avg       0.75      0.74      0.74       491
weighted avg       0.75      0.75      0.75       491

