In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import keras
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
folder_path = '/content/drive/MyDrive/IML 2 Project (Voice Recordings)'

audio_files = []

for file_name in os.listdir(folder_path):
    if file_name.endswith('.wav'):
        file_path = os.path.join(folder_path, file_name)
        audio_files.append(file_path)

In [None]:
y, sr = librosa.load('/content/drive/MyDrive/a4p/Trimmed audios/All/fe1.wav', sr=1)

In [None]:
y.shape

(10,)

In [None]:
mfccs = librosa.feature.mfcc(y=y, sr=1, n_mfcc=13)



In [None]:
mfccs

array([[-294.14792  ],
       [  53.043053 ],
       [  44.900948 ],
       [  24.14862  ],
       [  13.405695 ],
       [  15.364409 ],
       [   9.405208 ],
       [   9.9018135],
       [  10.076796 ],
       [ -18.671661 ],
       [ -32.85804  ],
       [   1.4159778],
       [  10.260677 ]], dtype=float32)

In [None]:
y, sr = librosa.load('/content/drive/MyDrive/a4p/Trimmed audios/All/male5.wav', sr=1)
y.shape
mfccs = librosa.feature.mfcc(y=y, sr=1, n_mfcc=13)

array([[-552.67444  ],
       [  37.877853 ],
       [  42.11888  ],
       [  33.972244 ],
       [  15.494717 ],
       [   1.4325881],
       [  18.784405 ],
       [ -14.036968 ],
       [   4.8086863],
       [ -24.701153 ],
       [  -3.3465242],
       [   7.4170785],
       [   6.859656 ]], dtype=float32)

In [None]:
def extract_mfcc_fixed_length(file_path, max_length=216):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

    if mfccs.shape[1] < max_length:
        pad_width = max_length - mfccs.shape[1]
        mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), 'constant')
    elif mfccs.shape[1] > max_length:
        mfccs = mfccs[:, :max_length]
    return mfccs

data = []
labels = []
for file in audio_files:
    mfccs = extract_mfcc_fixed_length(file)
    data.append(mfccs.T)
    labels.append('male' if 'male' in file else 'female')

data = np.array(data)
labels = np.array(labels)

le = LabelEncoder()
labels_encoded = le.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

data = data.reshape(data.shape[0], data.shape[1], data.shape[2], 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, labels_categorical, test_size=0.2, random_state=42)

model = Sequential()

model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(data.shape[1], data.shape[2], 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=1, epochs=15, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy: 0.7272727489471436


In [None]:
loss, accuracy = model.evaluate(X_test, y_test)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

predictions = model.predict(X_test)

predicted_classes = np.argmax(predictions, axis=1)

true_classes = np.argmax(y_test, axis=1)

print("Classification Report:")
print(classification_report(true_classes, predicted_classes))

print("Confusion Matrix:")
print(confusion_matrix(true_classes, predicted_classes))

Test Loss: 0.3784036338329315
Test Accuracy: 0.7272727489471436
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.50      0.57         4
           1       0.75      0.86      0.80         7

    accuracy                           0.73        11
   macro avg       0.71      0.68      0.69        11
weighted avg       0.72      0.73      0.72        11

Confusion Matrix:
[[2 2]
 [1 6]]


In [None]:
model.save('/content/drive/MyDrive/IML 2 Project (Voice Recordings)/my_model.h5')

In [13]:
df = pd.read_csv('audio mfcc data.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,Gender
0,-423.45193,58.089066,-24.702543,20.479845,-9.324167,1.830143,-15.952717,-16.05199,-17.06708,-9.57866,-3.260925,-7.075751,-6.939365,M
1,-314.5662,130.81305,-52.916336,30.596134,8.158245,-25.563534,-11.247474,-13.634462,-19.344921,-17.006086,-15.362623,-9.323466,-9.167624,M
2,-326.58572,150.42308,-14.021748,13.780667,1.91285,4.059244,-13.950055,-5.525473,-21.506798,-8.975996,-8.065133,-12.770766,-0.622452,M
3,-391.47177,122.89488,31.13436,20.492306,4.017022,7.269419,-14.926802,2.433048,-23.58822,-7.833317,-12.523981,-9.101804,-7.785457,M
4,-284.01328,130.51633,-5.334153,9.015629,2.21399,-10.544295,-12.102945,-10.563928,-13.953804,-12.749711,-10.826887,-6.201292,-4.782575,M


In [15]:
df.shape

(51, 14)

In [78]:
mfcc_data = df.drop(['Gender'],axis=1).values
reshaped_data = mfcc_data.reshape(51, 13, 1)

In [34]:
mfcclabels = df['Gender'].values

In [33]:
reshaped_data[0]

array([[-423.45193  ],
       [  58.089066 ],
       [ -24.702543 ],
       [  20.479845 ],
       [  -9.324167 ],
       [   1.8301427],
       [ -15.952717 ],
       [ -16.05199  ],
       [ -17.06708  ],
       [  -9.57866  ],
       [  -3.260925 ],
       [  -7.075751 ],
       [  -6.939365 ]])

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Load the data
df = pd.read_csv('audio mfcc data.csv')
df['Gender'] = df['Gender'].map({'M': 0, 'F': 1})

# Separate features and labels
mfcc_data = df.drop(['Gender'], axis=1).values
mfcclabels = df['Gender'].values

# Reshape MFCC data
reshaped_data = mfcc_data.reshape(mfcc_data.shape[0], mfcc_data.shape[1], 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reshaped_data, mfcclabels, test_size=0.2, random_state=42)

# Define the model
model = Sequential()
model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=(reshaped_data.shape[1], reshaped_data.shape[2])))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 neuron for binary classification

# Compile the model
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=1, epochs=15, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15