In [12]:
import os
import numpy as np
import librosa
import sounddevice as sd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import pickle
from scipy.spatial.distance import cosine

In [13]:
def extract_melspectrogram(file_path, n_mels=128):
    """Load an audio file and return its log-scaled (dB) mel spectrogram.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mels: Number of mel bands to compute.

    Returns:
        The array produced by ``librosa.power_to_db`` for the clip's mel
        power spectrogram.
    """
    # sr=None asks librosa to keep the file's own sample rate (no resampling).
    waveform, native_rate = librosa.load(file_path, sr=None)
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=native_rate,
        n_mels=n_mels,
    )
    return librosa.power_to_db(mel_power)

def extract_and_save_melspectrogram(file_path, save_path, n_mels=128):
    """Compute a clip's log-mel spectrogram, write it to disk, and return it.

    Feature extraction is delegated to ``extract_melspectrogram`` so that the
    spectrograms saved for training and the ones computed at verification
    time always come from the same code path.

    Args:
        file_path: Path of the audio file to read.
        save_path: Destination handed to ``np.save``.
        n_mels: Number of mel bands to compute.

    Returns:
        The log-mel spectrogram that was written to ``save_path``.
    """
    log_melspectrogram = extract_melspectrogram(file_path, n_mels=n_mels)
    np.save(save_path, log_melspectrogram)
    return log_melspectrogram

def process_and_save_spectrograms(data_dir, output_dir):
    """Convert every ``.wav`` file under ``data_dir`` into a saved spectrogram.

    ``data_dir`` is expected to contain one sub-directory per speaker. The same
    sub-directory layout is recreated under ``output_dir`` and each
    ``<name>.wav`` is written as ``<name>.npy`` through
    ``extract_and_save_melspectrogram``. Entries of ``data_dir`` that are not
    directories, and files that do not end in ``.wav``, are ignored.

    Args:
        data_dir: Root directory holding one folder of recordings per speaker.
        output_dir: Root directory that receives the ``.npy`` files; created
            if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Sorted listings make the processing order independent of the filesystem.
    for speaker in sorted(os.listdir(data_dir)):
        speaker_dir = os.path.join(data_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        speaker_output_dir = os.path.join(output_dir, speaker)
        os.makedirs(speaker_output_dir, exist_ok=True)
        for file_name in sorted(os.listdir(speaker_dir)):
            if not file_name.endswith('.wav'):
                continue
            file_path = os.path.join(speaker_dir, file_name)
            # Swap only the extension; str.replace would also rewrite any
            # '.wav' that appears earlier in the file name.
            stem = os.path.splitext(file_name)[0]
            save_path = os.path.join(speaker_output_dir, stem + '.npy')
            extract_and_save_melspectrogram(file_path, save_path)

# Input root: one sub-directory of .wav recordings per speaker.
data_dir = 'dataset/'
# Output root: receives the matching .npy spectrogram files; later cells read from it.
output_dir = 'spectrogram/'
# Runs the full conversion every time this cell is executed.
process_and_save_spectrograms(data_dir, output_dir)

In [8]:
def load_spectrograms(data_dir, n_frames=128):
    """Load saved spectrograms and their speaker labels from ``data_dir``.

    Each sub-directory of ``data_dir`` is treated as one speaker; its name is
    used as the label for every ``.npy`` file inside it. Every spectrogram is
    cropped to its first ``n_frames`` columns so that all samples share one
    shape. Arrays with fewer than ``n_frames`` columns (or that are not 2-D)
    are skipped, so the result can contain fewer samples than there are files.

    Directory listings are sorted so the sample order, and therefore any
    seeded split performed on the result, is reproducible across machines.

    Args:
        data_dir: Root directory holding one folder of ``.npy`` files per
            speaker.
        n_frames: Number of leading columns to keep from each spectrogram.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays: the stacked cropped
        spectrograms and the speaker name for each of them.
    """
    features = []
    labels = []
    for speaker in sorted(os.listdir(data_dir)):
        speaker_dir = os.path.join(data_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue
        for file_name in sorted(os.listdir(speaker_dir)):
            if not file_name.endswith('.npy'):
                continue
            melspectrogram = np.load(os.path.join(speaker_dir, file_name))
            # Too-short (or malformed) clips cannot be cropped to the common
            # shape, so they are left out rather than breaking np.array below.
            if melspectrogram.ndim != 2 or melspectrogram.shape[1] < n_frames:
                continue
            features.append(melspectrogram[:, :n_frames])
            labels.append(speaker)
    return np.array(features), np.array(labels)

# Read the saved spectrograms back; output_dir is defined in the preprocessing cell.
features, labels = load_spectrograms(output_dir)

# Encode labels: speaker-name strings -> integer ids -> one-hot rows.
# `le` is kept so the ids can be mapped back to speaker names later.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)
labels_one_hot = to_categorical(labels_encoded)

# Train-test split: 20% held out, seeded for repeatability.
# NOTE(review): the split is not stratified by speaker, so a speaker with few
# clips may be under-represented in one side — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(features, labels_one_hot, test_size=0.2, random_state=42)

# Expand dimensions to match the input shape of CNN (samples, height, width, channels)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

In [9]:
# Create CNN model
def create_cnn_model(input_shape, num_classes):
    """Build the (uncompiled) convolutional speaker classifier.

    The network stacks three Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, flattens the result, and finishes with a
    128-unit relu layer, 50% dropout and a softmax layer of ``num_classes``
    units.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv2D.
        num_classes: Number of output classes of the softmax layer.

    Returns:
        The assembled ``Sequential`` model.
    """
    model = Sequential()

    # Convolutional feature extractor; only the first layer declares the input.
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D((2, 2)))
    for n_filters in (64, 128):
        model.add(Conv2D(n_filters, (3, 3), activation='relu'))
        model.add(MaxPooling2D((2, 2)))

    # Classification head.
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model

# 128 mel bands (default n_mels) x 128 frames (crop applied in load_spectrograms)
# x 1 channel (added by np.expand_dims on the training arrays).
input_shape = (128, 128, 1)
# One output unit per speaker seen by the label encoder.
num_classes = len(le.classes_)
model = create_cnn_model(input_shape, num_classes)

# categorical_crossentropy pairs with the one-hot labels built by to_categorical.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 126, 126, 32)      320       
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 63, 63, 32)       0         
 2D)                                                             
                                                                 
 conv2d_4 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 30, 30, 64)       0         
 2D)                                                             
                                                                 
 conv2d_5 (Conv2D)           (None, 28, 28, 128)       73856     
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 14, 14, 128)     

In [10]:
# Train the model for 30 epochs; validation_split=0.2 has Keras hold back part
# of X_train/y_train for per-epoch validation, separate from X_test.
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate the model on the held-out test split and report accuracy as a percentage.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Accuracy: 87.50%


In [11]:
# Persist the trained network to an HDF5 file in the working directory.
model.save('speaker_verification_model.h5')

# Save the label encoder so class indices can be mapped back to speaker names
# when the model is reloaded.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

In [None]:
from scipy.spatial.distance import cosine

# Reload the trained network written by model.save(...).
model = tf.keras.models.load_model('speaker_verification_model.h5')

# Reload the fitted label encoder.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open('label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

def get_embeddings(model, spectrogram):
    """Run one spectrogram through ``model`` and return its output vector.

    When ``model`` exposes a 4-D ``input_shape`` of the form
    ``(batch, n_mels, n_frames, channels)``, a 2-D spectrogram is first
    cropped to its leading ``n_frames`` columns, mirroring the crop applied
    to the training data. Without this step any clip whose length differs
    from the training width is rejected by the network with a shape error.

    NOTE(review): ``model.predict`` returns the network's final-layer output.
    For the softmax classifier built in this notebook that is a vector of
    class probabilities, not a learned speaker embedding — confirm this is
    the representation you want to compare.

    Args:
        model: Object with a ``predict`` method and, optionally, an
            ``input_shape`` attribute.
        spectrogram: 2-D array shaped ``(n_mels, frames)``.

    Returns:
        Whatever ``model.predict`` returns for the single-sample batch
        (for a Keras model, an array with a leading batch axis of size 1).

    Raises:
        ValueError: If the spectrogram's mel-band count does not match the
            model, or it has fewer frames than the model requires.
    """
    spectrogram = np.asarray(spectrogram)
    expected = getattr(model, 'input_shape', None)
    if spectrogram.ndim == 2 and isinstance(expected, tuple) and len(expected) == 4:
        n_mels, n_frames = expected[1], expected[2]
        if n_mels is not None and spectrogram.shape[0] != n_mels:
            raise ValueError(
                f'Spectrogram has {spectrogram.shape[0]} mel bands, '
                f'but the model expects {n_mels}.'
            )
        if n_frames is not None:
            if spectrogram.shape[1] < n_frames:
                raise ValueError(
                    f'Spectrogram has {spectrogram.shape[1]} frames, '
                    f'but the model needs at least {n_frames}.'
                )
            spectrogram = spectrogram[:, :n_frames]
    # (n_mels, frames) -> (1, n_mels, frames, 1): add channel and batch axes.
    spectrogram = np.expand_dims(spectrogram, axis=-1)
    spectrogram = np.expand_dims(spectrogram, axis=0)
    embeddings = model.predict(spectrogram)
    return embeddings

def verify_user(live_embeddings, registered_embeddings):
    """Return the cosine similarity between two embedding vectors.

    Both inputs are flattened first, because ``model.predict`` yields arrays
    with a leading batch axis (shape ``(1, N)``) while SciPy's ``cosine`` is
    defined for 1-D vectors.

    Args:
        live_embeddings: Embedding of the clip being verified.
        registered_embeddings: Embedding of the enrolled reference clip.

    Returns:
        ``1 - cosine_distance``: 1.0 for vectors pointing the same way,
        0.0 for orthogonal vectors.
    """
    live_vector = np.ravel(live_embeddings)
    registered_vector = np.ravel(registered_embeddings)
    similarity = 1 - cosine(live_vector, registered_vector)
    return similarity

# Compute log-mel spectrograms for the clip under test and the enrolled reference.
# The two paths are placeholders and must point at real .wav files.
# NOTE(review): extract_melspectrogram keeps every frame of the clip, whereas the
# training data was cropped to 128 frames in load_spectrograms — confirm the
# model accepts these inputs.
live_spectrogram = extract_melspectrogram('path_to_live_audio.wav')
registered_spectrogram = extract_melspectrogram('path_to_registered_audio.wav')
# Pass each spectrogram through the network.
live_embeddings = get_embeddings(model, live_spectrogram)
registered_embeddings = get_embeddings(model, registered_spectrogram)
# Compare the two outputs; no accept/reject threshold is applied here.
similarity = verify_user(live_embeddings, registered_embeddings)
print(f'Similarity: {similarity:.2f}')