In [2]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [3]:
import os
import numpy as np
import librosa
from pydub import AudioSegment
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from google.colab import drive


In [4]:
# `drive` is already imported in the imports cell above; mount Google Drive
# so the dataset under /content/drive is reachable
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Define function to convert m4a audio files to wav format
def convert_to_wav(input_path, output_path):
    """Decode an audio file with pydub and write it back out as WAV.

    Args:
        input_path: Path of the source audio file, handed to
            ``AudioSegment.from_file``.
        output_path: Path the WAV data is exported to.

    Returns:
        None. The WAV file at ``output_path`` is the only result.
    """
    audio = AudioSegment.from_file(input_path)
    # export() hands back the output file handle still open; close it
    # explicitly instead of relying on garbage collection to do so.
    exported = audio.export(output_path, format="wav")
    exported.close()

In [6]:
# Define function to load audio files, resampled to 16 kHz (no noise handling is performed)
def load_audio(file_path, sr=16000):
    """Load an audio file as a waveform resampled to ``sr`` Hz.

    Args:
        file_path: Path of the audio file to read with ``librosa.load``.
        sr: Target sample rate in Hz. Defaults to 16000; keep it equal to
            the rate given to the feature extractor.

    Returns:
        The waveform samples returned by ``librosa.load``. The sample rate
        that ``librosa.load`` also returns is discarded.
    """
    audio_data, _ = librosa.load(file_path, sr=sr)
    return audio_data

In [7]:
# Define function to extract MFCC features from audio data
def extract_mfcc(audio_data, sr=16000, n_mfcc=40):
    """Summarise a waveform as its time-averaged MFCC vector.

    Args:
        audio_data: Waveform samples, as produced by ``load_audio``.
        sr: Sample rate of ``audio_data`` in Hz. Defaults to 16000, the
            rate ``load_audio`` resamples to by default.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 40.

    Returns:
        Array holding the mean of each MFCC coefficient over all frames
        (one value per coefficient).
    """
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=n_mfcc)
    # mfccs is (coefficients, frames); transpose so axis 0 is time, then
    # average over it to collapse the clip to a fixed-length vector.
    mfccs_mean = np.mean(mfccs.T, axis=0)
    return mfccs_mean

In [8]:
def preprocess_audio(audio_dir):
    """Build the feature matrix and label vector from folders of .m4a files.

    Reads the ``females`` and ``males`` sub-directories of ``audio_dir``
    (labels 0 and 1 respectively). Each ``.m4a`` file is converted to a
    ``.wav`` file written next to it, loaded, and reduced to its
    time-averaged MFCC vector. Files with other extensions are ignored.

    Args:
        audio_dir: Directory containing the ``females`` and ``males`` folders.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: one feature row per processed file
        and the matching integer labels.
    """
    X = []
    y = []
    for label, sub_dir in enumerate(["females", "males"]):
        sub_dir_path = os.path.join(audio_dir, sub_dir)
        # os.listdir() returns entries in arbitrary order; sort them so the
        # sample order is reproducible from run to run.
        for file_name in sorted(os.listdir(sub_dir_path)):
            # Case-insensitive match so files named "*.M4A" are not skipped.
            if not file_name.lower().endswith('.m4a'):
                continue
            file_path = os.path.join(sub_dir_path, file_name)
            # Convert m4a files to wav format
            wav_file_path = os.path.splitext(file_path)[0] + '.wav'
            convert_to_wav(file_path, wav_file_path)
            audio_data = load_audio(wav_file_path)
            X.append(extract_mfcc(audio_data))
            y.append(label)
    return np.array(X), np.array(y)

In [9]:
# Define data directory: dataset root on the mounted Drive. preprocess_audio()
# reads the "females" and "males" sub-folders directly beneath it.
data_dir = '/content/drive/My Drive/VoxCeleb_gender/'

In [10]:
# Preprocess audio files: X holds one time-averaged MFCC row per clip,
# y the matching labels (0 = females, 1 = males)
X, y = preprocess_audio(data_dir)
# Fail early with a clear message if no usable clips were found or the
# feature rows are ragged, rather than deep inside the later sklearn calls
assert X.ndim == 2 and len(X) == len(y), f"unexpected dataset shape {X.shape} loaded from {data_dir}"

In [11]:
# Perform feature selection using SelectKBest and ANOVA F-value
selector = SelectKBest(score_func=f_classif, k=20)  # Select top 20 features
X_selected = selector.fit_transform(X, y)

In [12]:
# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [13]:
# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# One-hot encode the integer class labels of both splits
num_classes = 2  # females and males
y_train, y_test = (to_categorical(labels, num_classes) for labels in (y_train, y_test))

In [15]:
# Reshape input data to include time steps: (samples, features) -> (samples, 1, features).
# The feature count is read from shape[-1] rather than shape[1] so this cell
# stays safe to re-run: it is the last axis both before and after the reshape.
X_train_scaled = np.reshape(X_train_scaled, (X_train_scaled.shape[0], 1, X_train_scaled.shape[-1]))
X_test_scaled = np.reshape(X_test_scaled, (X_test_scaled.shape[0], 1, X_test_scaled.shape[-1]))


In [16]:
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import LearningRateScheduler

In [17]:
# Build the classifier layer by layer: two stacked 128-unit LSTMs, each
# followed by dropout, batch normalization between them, and a softmax head
model = Sequential()
model.add(LSTM(units=128, input_shape=(1, X_train_scaled.shape[2]), return_sequences=True))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(LSTM(units=128))
model.add(Dropout(0.5))
model.add(Dense(units=num_classes, activation='softmax'))

In [18]:
# Learning rate scheduler function
def lr_scheduler(epoch, lr):
    """Hold the learning rate for 10 epochs, then decay it by exp(-0.1) per epoch.

    Args:
        epoch: Index of the epoch about to start (the first epoch is 0).
        lr: Learning rate currently in effect.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.1)``
        as a plain Python float.
    """
    if epoch < 10:
        return lr  # Keep the initial learning rate for the first 10 epochs
    # Return a plain float, not a tf.Tensor: newer Keras versions reject
    # non-float schedule outputs with a ValueError.
    return float(lr * np.exp(-0.1))

In [19]:
# Compile with Adam at an initial learning rate of 0.001; the per-epoch
# schedule is not set here but applied through a callback at fit time
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [20]:
# Define learning rate scheduler callback: wraps lr_scheduler so it can be
# passed to model.fit() through the `callbacks` argument
lr_callback = LearningRateScheduler(lr_scheduler)

In [21]:
# Fit for 50 epochs in batches of 64 with the learning-rate callback attached;
# note that the held-out test split doubles as the validation set here
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    batch_size=64,
    validation_data=(X_test_scaled, y_test),
    callbacks=[lr_callback],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [22]:
# Score the trained model on the held-out split; evaluate() yields the loss
# followed by the accuracy metric requested at compile time
test_loss, test_accuracy = model.evaluate(x=X_test_scaled, y=y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Test Loss: 0.27884936332702637
Test Accuracy: 0.8884263038635254


In [23]:
from google.colab import files

In [24]:
# Upload audio file through Colab's file picker. The result is used below as
# a mapping whose keys are the uploaded file names.
uploaded = files.upload()

Saving audio1.opus to audio1.opus


In [25]:
# Convert uploaded file to wav format
uploaded_file_path = list(uploaded.keys())[0]
# splitext strips only the final extension, so a name with extra dots
# (e.g. "clip.v2.opus") keeps its full stem instead of being cut at the first dot
converted_wav_file_path = os.path.splitext(uploaded_file_path)[0] + '.wav'
convert_to_wav(uploaded_file_path, converted_wav_file_path)

In [26]:
# Run the uploaded clip through the same steps as the training data:
# MFCC means -> fitted feature selector -> fitted scaler -> (1, 1, features)
audio_data = load_audio(converted_wav_file_path)
mfccs = extract_mfcc(audio_data)
selected_mfccs = selector.transform(mfccs[np.newaxis, :])  # single-row 2-D input
scaled_mfccs = scaler.transform(selected_mfccs)
input_data = scaled_mfccs[:, np.newaxis, :]  # insert the time-step axis


In [27]:
# Classify the clip: output index 0 is reported as "female", anything else as "male"
prediction = model.predict(input_data)
predicted_class = prediction.argmax()
gender = "male" if predicted_class != 0 else "female"
print(f"The predicted gender for {converted_wav_file_path} is: {gender}")

The predicted gender for audio1.wav is: female
