## Voice Gender and Human-vs-AI Classification

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import librosa


In [7]:
# Load audio files and extract features
def _fit_to_length(mfccs, target_length):
    """Zero-pad or truncate the frame axis (axis 1) of `mfccs` to `target_length`."""
    n_frames = mfccs.shape[1]
    if n_frames < target_length:
        return np.pad(mfccs, ((0, 0), (0, target_length - n_frames)), mode='constant')
    return mfccs[:, :target_length]


def load_and_extract_features(*audio_paths, max_length=None, n_mfcc=13, fmin=50, fmax=200):
    """Extract MFCCs plus two scalar summaries from each audio file.

    Parameters
    ----------
    *audio_paths : str
        Paths of the audio files to load with ``librosa.load`` (default settings).
    max_length : int, optional
        Number of MFCC frames to keep per file. Shorter clips are zero-padded
        at the end, longer clips are truncated. When ``None``, every clip is
        zero-padded to the longest clip in this call so the result can still
        be stacked into one rectangular array.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute per frame.
    fmin, fmax : float, default 50 and 200
        Frequency search range (Hz) handed to ``librosa.yin``.
        NOTE(review): 200 Hz may be too low an upper bound for higher-pitched
        voices; raise ``fmax`` if the pitch feature is actually used.

    Returns
    -------
    mfccs : np.ndarray
        Stacked MFCCs, one ``(n_mfcc, n_frames)`` matrix per file.
    pitch : np.ndarray
        Mean YIN fundamental-frequency estimate per file.
    formants : np.ndarray
        Mean of the harmonic component of each waveform. Despite the name this
        is not a formant estimate; the name is kept for backward compatibility.

    With no paths, three empty arrays are returned.
    """
    mfccs_list = []
    pitch_list = []
    formants_list = []

    for audio_path in audio_paths:
        y, sr = librosa.load(audio_path)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

        if max_length is not None:
            mfccs = _fit_to_length(mfccs, max_length)

        mfccs_list.append(mfccs)
        pitch_list.append(np.mean(librosa.yin(y, fmin=fmin, fmax=fmax, sr=sr)))
        formants_list.append(np.mean(librosa.effects.harmonic(y)))

    # Without an explicit length, clips of different duration would give a
    # ragged list that np.array cannot stack; equalise to the longest clip.
    if max_length is None and mfccs_list:
        longest = max(m.shape[1] for m in mfccs_list)
        mfccs_list = [_fit_to_length(m, longest) for m in mfccs_list]

    return np.array(mfccs_list), np.array(pitch_list), np.array(formants_list)


# max_length = max(X_human_male_mfccs.shape[1], X_human_female_mfccs.shape[1], X_ai_male_mfccs.shape[1], X_ai_female_mfccs.shape[1])





In [9]:
from sklearn.model_selection import train_test_split

# One seed drives both the shuffle and the train/test split so that a
# Restart & Run All reproduces the same partition.
SEED = 42

# Define audio paths
human_male_audio_paths = [
    'Human-voice2-m.mp3',
    'Human-voice3-m.mp3',
    'Human-voice4-m.mp3',
    'Human-voice5-m.mp3',
    'Human-4-m.mp3',
    'Human-5-m.mp3',
    'Human-6-m.mp3',
    'Human-7-m.mp3'
]

human_female_audio_paths = [
    'Human-voice6-f.mp3',
    'Human-voice1-f.mp3',
    'Human-voice (copy)-f.mp3',
    # 'Human-1-f.aac',
    'Human-2-f.mp3',
    'Human-3-f.mp3',
    'Human-8-f.mp3'
]

ai_male_audio_paths = [
    'synthesize-m.mp3',
    'synthesize-1-m.mp3',
    'synthesize-2-m.mp3',
    'synthesize-3-m.mp3',
    'synthesize-4-m.mp3',
    'synthesize-5-m.mp3',
    'synthesize-6-m.mp3',
    'synthesize-7-m.mp3',
    'synthesize-8-m.mp3'
]

ai_female_audio_paths = [
    'synthesize-f.mp3',
    'synthesize-1-f.mp3',
    'synthesize-2-f.mp3',
    'synthesize-3-f.mp3',
    'synthesize-4-f.mp3',
    'synthesize-5-f.mp3',
    'synthesize-6-f.mp3',
    'synthesize-7-f.mp3',
    'synthesize-8-f.mp3'
]

# Every clip is padded/truncated to this many MFCC frames so the clips stack
# into one array; inference inputs must use the same value.
max_length = 500

# Load and extract features from audio files with padding/truncation.
# Only the MFCCs are used; the pitch and "formants" outputs are discarded.
X_human_male_mfccs, _, _ = load_and_extract_features(*human_male_audio_paths, max_length=max_length)
X_human_female_mfccs, _, _ = load_and_extract_features(*human_female_audio_paths, max_length=max_length)
X_ai_male_mfccs, _, _ = load_and_extract_features(*ai_male_audio_paths, max_length=max_length)
X_ai_female_mfccs, _, _ = load_and_extract_features(*ai_female_audio_paths, max_length=max_length)

# Create labels
y_human_male = np.zeros(len(human_male_audio_paths))  # Human male voice: 0
y_human_female = np.ones(len(human_female_audio_paths))  # Human female voice: 1
y_ai_male = np.ones(len(ai_male_audio_paths)) * 2  # AI male voice: 2
y_ai_female = np.ones(len(ai_female_audio_paths)) * 3  # AI female voice: 3

# Combine features and labels
X = np.concatenate([X_human_male_mfccs, X_human_female_mfccs, X_ai_male_mfccs, X_ai_female_mfccs])
y = np.concatenate([y_human_male, y_human_female, y_ai_male, y_ai_female])
assert len(X) == len(y), "features and labels must line up one-to-one"

# Shuffle data with a seeded generator (an unseeded shuffle would make the
# later fixed-seed split differ on every run).
random_indices = np.random.default_rng(SEED).permutation(len(X))
X = X[random_indices]
y = y[random_indices]

# Normalize MFCCs with a single global mean/std. The statistics are kept as
# named values because any audio scored later must be scaled the same way.
mfcc_mean = np.mean(X)
mfcc_std = np.std(X)
X_mfccs_normalized = (X - mfcc_mean) / mfcc_std

# Split data into train and test sets. Stratifying keeps all four classes
# represented in the small test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_mfccs_normalized, y, test_size=0.2, random_state=SEED, stratify=y
)


In [10]:
# Define deep neural network model
# Conv1D convolves along axis 1 of its (steps, channels) input. The MFCC
# arrays are laid out as (n_mfcc, n_frames), so without the Permute the kernel
# would slide across the MFCC coefficients and treat the time frames as
# channels. Permute swaps the two axes inside the model, which keeps the
# accepted input shape unchanged while making the convolution run over time.
model1 = models.Sequential([
    layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
    layers.Permute((2, 1)),  # (n_mfcc, n_frames) -> (n_frames, n_mfcc)
    layers.Conv1D(64, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Conv1D(128, kernel_size=3, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='softmax')  # 4 output classes
])

# Compile model (labels are class indices 0-3, hence the sparse loss)
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train on the training split only and report metrics on the held-out split;
# fitting on the full dataset would leak the test samples into training.
history = model1.fit(
    X_train, y_train,
    epochs=20, batch_size=32,
    validation_data=(X_test, y_test),
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1e6956fe7d0>

In [15]:
# Single-file feature extraction for inference
def load_and_extract_feature(audio_path, max_length=None):
    """Return the 13-coefficient MFCCs of one audio file as a batch of one.

    When `max_length` is given, the frame axis is zero-padded at the end or
    truncated so that it has exactly `max_length` frames. The result carries
    a leading batch axis of size 1.
    """
    signal, sample_rate = librosa.load(audio_path)
    features = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13)

    if max_length is not None:
        # Positive: clip is too short and needs padding; negative: too long.
        shortfall = max_length - features.shape[1]
        if shortfall > 0:
            features = np.pad(features, ((0, 0), (0, shortfall)), mode='constant')
        elif shortfall < 0:
            features = features[:, :max_length]

    # Prepend the batch dimension the model expects.
    return features[np.newaxis, ...]

# Load and extract features from a single audio file.
# NOTE(review): this clip is also listed among the training files, so the
# prediction below is a smoke test rather than evidence of generalisation.
audio_path = 'Human-4-m.mp3'
max_length = 500  # Must equal the frame count the model was trained with
X_single_audio = load_and_extract_feature(audio_path, max_length=max_length)

# The model was trained on MFCCs standardised with the global mean/std of the
# training array X, so the same scaling has to be applied here; feeding raw
# MFCCs would present the network with inputs on a different scale.
X_single_audio_normalized = (X_single_audio - np.mean(X)) / np.std(X)

# Make prediction using the trained model (one row of 4 class probabilities)
predicted_label = model1.predict(X_single_audio_normalized)

# Decode the predicted label: index order matches the training labels 0-3
class_names = ['human male', 'human female', 'robot male', 'robot female']
predicted_class_index = int(np.argmax(predicted_label[0]))
predicted_class = class_names[predicted_class_index]

print("Predicted class:", predicted_class)



Predicted class: human male
