## Importing Libraries 
These libraries are used for data manipulation (pandas, numpy), audio processing (librosa), machine learning (sklearn), and saving/loading models (joblib).

In [4]:
import pandas as pd
import numpy as np
import librosa
import os
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import joblib

# Setting Constants
  - FOLDER_PATH: The directory that contains the audio clips to process.
  - NUM_COMPONENTS: The number of Gaussian components in the UBM. A higher number allows the model to capture more complex distributions but increases computational cost.
  - NUM_IVECTORS: The dimensionality of the i-vectors. This is the reduced dimension after applying PCA to the Baum-Welch statistics.
  - COVARIANCE_TYPE: The type of covariance matrix used in the GMM. 'diag' assumes diagonal covariance matrices, which simplifies computation.
  - EPS: A small constant to avoid division by zero.

In [1]:
# Directory scanned for audio clips by the feature-loading cell.
FOLDER_PATH = '../data/clips'
# Number of Gaussian components passed to the UBM's GaussianMixture.
NUM_COMPONENTS = 128
# n_components of the PCA model that stands in for the T-matrix.
NUM_IVECTORS = 13
# covariance_type passed to the UBM's GaussianMixture.
COVARIANCE_TYPE = 'diag'
# Added to the zero-order statistics in extract_ivector before dividing by them.
EPS = 1e-6

# Audio Preprocessing and Feature Extraction
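  - *Normalization*: Rescales the audio signal by its peak absolute amplitude (`librosa.util.normalize`), so that all samples lie in [-1, 1]. This removes loudness differences between clips and helps stabilize the feature extraction process.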
  - *MFCCs*: Mel-Frequency Cepstral Coefficients are a representation of the short-term power spectrum of a sound. They are widely used in speech processing because they capture the characteristics of the human voice.
  - *Delta and Delta-Delta*: These are the first and second derivatives of the MFCCs, capturing dynamic information about how the MFCCs change over time.
  - *Concatenation*: Combines the MFCCs, delta, and delta-delta features into a single feature vector.

In [None]:
def normalize_audio(y):
    """Rescale an audio signal with ``librosa.util.normalize``.

    With librosa's default arguments this is peak normalization: the signal
    is divided by its maximum absolute value. It is not a zero-mean /
    unit-variance standardization.

    Args:
        y: Audio samples as returned by ``librosa.load``.

    Returns:
        The rescaled samples.
    """
    scaled = librosa.util.normalize(y)
    return scaled

def extract_mfcc(audio_file, sr=16000, n_mfcc=13):
    """Load an audio file and return MFCC + delta + delta-delta features.

    Args:
        audio_file: Path of the audio file to load.
        sr: Sample rate the audio is resampled to on load. Defaults to
            16000, the value that was previously hard-coded.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 13, the
            value that was previously hard-coded.

    Returns:
        A 2-D array with one row per frame and ``3 * n_mfcc`` columns
        (static coefficients, first-order deltas, second-order deltas).
    """
    y, sr = librosa.load(audio_file, sr=sr)
    y = normalize_audio(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    delta_mfcc = librosa.feature.delta(mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)
    # librosa returns (coefficients, frames); stack the three feature sets
    # along the coefficient axis, then transpose so each row is one frame.
    mfcc_features = np.concatenate((mfcc, delta_mfcc, delta2_mfcc), axis=0)
    return mfcc_features.T

# Baum-Welch Statistics
  - *Responsibilities*: The posterior probabilities that a feature vector belongs to each Gaussian component in the UBM.
  - *N*: The zero-order Baum-Welch statistic, representing the total responsibility of each Gaussian component.
  - *F*: The first-order Baum-Welch statistic, representing the weighted sum of the feature vectors for each Gaussian component.

In [None]:
def compute_baum_welch_statistics(mfccs, ubm):
    """Compute zero- and first-order Baum-Welch statistics against a UBM.

    Args:
        mfccs: Feature matrix with one row per frame.
        ubm: Fitted mixture model exposing ``predict_proba``.

    Returns:
        A tuple ``(N, F)``: ``N`` holds the per-component sum of the
        posteriors over all frames, and ``F`` holds the posterior-weighted
        sum of the feature vectors for each component.
    """
    posteriors = ubm.predict_proba(mfccs)
    zeroth_order = posteriors.sum(axis=0)
    first_order = posteriors.T @ mfccs
    return zeroth_order, first_order

# i-Vector Extraction
- *i-Vector*: A low-dimensional representation of the speaker characteristics. It is obtained by projecting the Baum-Welch statistics onto a low-dimensional subspace (defined by the T-matrix).
- *T-Matrix*: A matrix that maps the high-dimensional Baum-Welch statistics to the low-dimensional i-vector space.

In [None]:
def extract_ivector(F, N, t_matrix):
    """Project occupancy-normalized first-order statistics with the T-matrix.

    Args:
        F: First-order statistics, one row per UBM component.
        N: Zero-order statistics, one value per UBM component.
        t_matrix: Fitted model exposing ``transform``.

    Returns:
        The transformed statistics flattened into a 1-D array.
    """
    # EPS keeps components with no assigned frames from dividing by zero.
    occupancy = (N + EPS)[:, None]
    mean_statistics = F / occupancy
    projected = t_matrix.transform(mean_statistics)
    return projected.flatten()

# Length Normalization
- *Length Normalization*: Ensures that the i-vectors lie on a hypersphere, which improves the performance of the speaker verification.

In [None]:
def length_normalize(ivector):
    """Scale an i-vector to unit Euclidean length.

    Args:
        ivector: Array-like vector to normalize.

    Returns:
        ``ivector`` divided by its L2 norm. An all-zero vector has no
        direction, so it is returned as zeros instead of the NaNs that a
        division by a zero norm would produce.
    """
    ivector = np.asarray(ivector)
    norm = np.linalg.norm(ivector)
    if norm == 0:
        return np.zeros_like(ivector, dtype=float)
    return ivector / norm

# PLDA Training
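- *PLDA*: Probabilistic Linear Discriminant Analysis, a generative model that captures both between-speaker and within-speaker variability. It is used to compute the likelihood ratio for speaker verification. Note that the code below uses scikit-learn's `LinearDiscriminantAnalysis` as a simplified stand-in rather than a full PLDA model.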

In [2]:
def train_plda(ivectors, labels):
    """Fit the scoring back-end on labelled i-vectors.

    Despite the name, the model is scikit-learn's
    ``LinearDiscriminantAnalysis`` with default settings.

    Args:
        ivectors: Training i-vectors, one row per sample.
        labels: Speaker label for each row of ``ivectors``.

    Returns:
        The fitted ``LinearDiscriminantAnalysis`` instance.
    """
    return LinearDiscriminantAnalysis().fit(ivectors, labels)

def verify_speaker_plda(test_ivector, reference_ivector, plda):
    """Score how likely two i-vectors come from the same trained speaker.

    The previous implementation returned the test vector's posterior for
    class index 1, so the reference vector never influenced the score. The
    score is now the dot product of the two class-posterior vectors, i.e.
    the probability that both i-vectors are assigned to the same speaker
    class of ``plda``.

    Args:
        test_ivector: i-vector of the utterance being verified.
        reference_ivector: i-vector of the claimed speaker.
        plda: Fitted classifier exposing ``predict_proba``.

    Returns:
        A float in [0, 1]; higher means the two vectors are more likely to
        belong to the same speaker.
    """
    pair = np.vstack((
        np.reshape(test_ivector, (1, -1)),
        np.reshape(reference_ivector, (1, -1)),
    ))
    test_posterior, reference_posterior = plda.predict_proba(pair)
    return float(np.dot(test_posterior, reference_posterior))

# Loading Audio Files and Extracting MFCCs
- *MFCC Extraction*: Converts the raw audio signal into a set of features that capture the spectral characteristics of the voice.

In [5]:
# Extract one MFCC feature matrix per supported audio file in FOLDER_PATH.
# The directory listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the stacked training data (and anything
# fitted on it) differ between runs and machines.
mfccs_list = []
for filename in sorted(os.listdir(FOLDER_PATH)):
    if filename.lower().endswith((".mp3", ".mp4", ".wav", ".flac")):
        audio_file = os.path.join(FOLDER_PATH, filename)
        mfcc_features = extract_mfcc(audio_file)
        mfccs_list.append(mfcc_features)

# Fail with a clear message instead of np.vstack's generic error.
if not mfccs_list:
    raise ValueError(f"No supported audio files found in {FOLDER_PATH!r}")

# Stack the frames of every clip into a single (total_frames, n_features) array.
mfccs_array = np.vstack(mfccs_list)

# Normalizing MFCCs
- *Standardization*: Ensures that all features are on the same scale, which is important for the performance of the GMM.

In [17]:
# Standardize every feature dimension over the frames of all clips.
scaler = StandardScaler()
mfccs_array = scaler.fit_transform(mfccs_array)
# Persist the fitted scaler like the UBM and the T-matrix: features of new
# audio must be scaled with these same statistics before the saved models
# can be applied to them.
joblib.dump(scaler, 'scaler.pkl')

# Training the UBM
- *UBM*: A Universal Background Model, i.e. a GMM that represents the distribution of audio features across all speakers. It is used as a reference model for computing speaker-specific i-vectors.

In [18]:
# Fit the UBM on the standardized frames of all clips, then save it to disk.
# random_state is fixed so repeated runs start from the same initialization.
ubm = GaussianMixture(
    n_components=NUM_COMPONENTS,
    covariance_type=COVARIANCE_TYPE,
    max_iter=100,
    random_state=42,
).fit(mfccs_array)
joblib.dump(ubm, 'ubm_model.pkl')



['ubm_model.pkl']

# Computing Baum-Welch Statistics
Computes the Baum-Welch statistics for the MFCC features using the trained UBM.

In [19]:
# Statistics are computed over mfccs_array, i.e. the stacked frames of every
# clip at once, so N and F describe the whole corpus rather than one clip.
N, F = compute_baum_welch_statistics(mfccs_array, ubm)

# Training the T-Matrix
Trains a PCA model to reduce the dimensionality of the Baum-Welch statistics.

In [20]:
# A PCA fitted on the first-order statistics plays the role of the T-matrix;
# each UBM component's row of F is one training sample. Save it to disk.
t_matrix = PCA(n_components=NUM_IVECTORS).fit(F)
joblib.dump(t_matrix, 't_matrix.pkl')

['t_matrix.pkl']

# Extracting i-Vectors
Extracts i-vectors from the Baum-Welch statistics and normalizes them.

In [21]:
# Build one length-normalized i-vector per clip.
# extract_ivector returns a single flattened 1-D vector, so the previous code
# iterated over its scalar entries and "normalized" each one on its own,
# collapsing every value to +/-1. Each clip's raw MFCCs are scaled with the
# scaler fitted on the training data, turned into per-clip Baum-Welch
# statistics, projected, and only then normalized as a whole vector.
ivectors = []
for clip_mfccs in mfccs_list:
    clip_N, clip_F = compute_baum_welch_statistics(scaler.transform(clip_mfccs), ubm)
    clip_ivector = extract_ivector(clip_F, clip_N, t_matrix)
    ivectors.append(length_normalize(clip_ivector))
# Shape (n_clips, ivector_dim); rows follow the order of mfccs_list.
ivectors = np.array(ivectors)

# Saving i-Vectors
Saves the extracted i-vectors to a file for later use.

In [22]:
# Write the i-vector array to 'ivectors.npy' in the current working directory.
np.save('ivectors.npy', ivectors)