In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/tf_Wav_reader.py
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/611.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/364.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/367.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/116.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/1490.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/374.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/1359.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/485.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/456.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/626.wav
/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Nelson_Mandela/590.

In [12]:
!pip install torchaudio hmmlearn numpy torch



In [26]:
import os
import torchaudio
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from hmmlearn import hmm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


****Step 3: Define Preprocessing Functions**
# Load and preprocess audio files:**


In [27]:
import torchaudio
import torch

def load_audio(file_path, sample_rate=16000):
    """
    Loads an audio file and resamples it to the desired sample rate.
    
    Args:
    file_path (str): Path to the audio file.
    sample_rate (int): Desired sample rate.
    
    Returns:
    torch.Tensor: Resampled audio waveform.
    """
    waveform, original_sample_rate = torchaudio.load(file_path)
    if original_sample_rate != sample_rate:
        resample_transform = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=sample_rate)
        waveform = resample_transform(waveform)
    return waveform

def normalize_audio(waveform):
    """
    Normalizes the audio waveform to have values between -1 and 1.
    
    Args:
    waveform (torch.Tensor): Audio waveform.
    
    Returns:
    torch.Tensor: Normalized audio waveform.
    """
    return waveform / waveform.abs().max()

def trim_silence(waveform, threshold=0.01):
    """
    Trims leading and trailing silence from the audio waveform.
    
    Args:
    waveform (torch.Tensor): Audio waveform.
    threshold (float): Silence threshold.
    
    Returns:
    torch.Tensor: Trimmed audio waveform.
    """
    non_silent_indices = torch.where(waveform.abs() > threshold)[1]
    if non_silent_indices.size(0) == 0:
        return waveform
    start, end = non_silent_indices[0], non_silent_indices[-1]
    return waveform[:, start:end + 1]

def extract_mfcc(waveform, sample_rate=16000, n_mfcc=13):
    """
    Extracts MFCC features from the audio waveform.
    
    Args:
    waveform (torch.Tensor): Audio waveform.
    sample_rate (int): Sample rate of the audio.
    n_mfcc (int): Number of MFCC coefficients to extract.
    
    Returns:
    numpy.ndarray: Extracted MFCC features.
    """
    mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
    mfcc_features = mfcc_transform(waveform).squeeze(0).transpose(0, 1)
    return mfcc_features.numpy()



In [28]:
import torchaudio
import torch

def preprocess_audio(file_path, sample_rate=16000, n_mfcc=13):
    # Load the audio waveform
    waveform, original_sample_rate = torchaudio.load(file_path)
    if original_sample_rate != sample_rate:
        resample_transform = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=sample_rate)
        waveform = resample_transform(waveform)
    
    # Normalize the audio
    waveform = waveform / waveform.abs().max()
    
    # Trim silence (optional, can be removed if not needed)
    non_silent_indices = torch.where(waveform.abs() > 0.01)[1]
    if non_silent_indices.size(0) > 0:
        start, end = non_silent_indices[0], non_silent_indices[-1]
        waveform = waveform[:, start:end + 1]
    
    # Extract MFCC features
    mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc)
    mfcc_features = mfcc_transform(waveform).squeeze(0).transpose(0, 1)
    
    return mfcc_features.numpy()

In [29]:
#### Step 4: Define `SpeakerDataset` Class

class SpeakerDataset(Dataset):
    def __init__(self, audio_paths, labels, sample_rate=16000, num_mfcc=13, transform=None):
        self.audio_paths = audio_paths
        self.labels = labels
        self.sample_rate = sample_rate
        self.num_mfcc = num_mfcc
        self.transform = transform

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.labels[idx]
        mfcc_features = preprocess_audio(audio_path, self.sample_rate, self.num_mfcc)
        if self.transform:
            mfcc_features = self.transform(mfcc_features)
        return torch.tensor(mfcc_features, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

**Step 6: Speaker Recognition**

In [42]:
def recognize_speaker(audio_path, models, sample_rate=16000, num_mfcc=13):
    mfcc_features = preprocess_audio(audio_path, sample_rate, num_mfcc)
    best_score = float('-inf')
    best_speaker = None
    
    for speaker, model in models.items():
        try:
            score = model.score(mfcc_features)
            if score > best_score:
                best_score = score
                best_speaker = speaker
        except Exception as e:
            print(f"Error scoring speaker {speaker}: {e}")
    
    return best_speaker

In [41]:
# Train HMM models
def train_hmm(speaker_data, n_components=4, n_iter=100):
    models = {}
    for speaker, features in speaker_data.items():
        try:
            model = hmm.GaussianHMM(n_components=n_components, n_iter=n_iter, covariance_type='diag')
            model.fit(features)
            models[speaker] = model
        except Exception as e:
            print(f"An error occurred while training HMM for speaker {speaker}: {e}")
    return models


**Example Usage******

In [43]:
# Prepare data
data_dir = '/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches'
audio_paths = []
labels = []

for speaker in os.listdir(data_dir):
    speaker_dir = os.path.join(data_dir, speaker)
    if os.path.isdir(speaker_dir):
        for file_name in os.listdir(speaker_dir):
            if file_name.endswith('.wav'):
                audio_paths.append(os.path.join(speaker_dir, file_name))
                labels.append(speaker)

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split data into training and testing sets
train_paths, test_paths, train_labels, test_labels = train_test_split(audio_paths, encoded_labels, test_size=0.2, random_state=42)

# Create datasets and data loaders
train_dataset = SpeakerDataset(train_paths, train_labels)
test_dataset = SpeakerDataset(test_paths, test_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)




# You can now proceed to train HMM models or other machine learning models


In [47]:
import numpy as np
from hmmlearn import hmm

# Function to ensure array is 2D
def ensure_2d(array):
    if len(array.shape) == 1:
        return array.reshape(-1, 1)
    elif len(array.shape) == 3:
        # Handle the unexpected 3D shape
        if array.shape[0] == 13:
            return array.reshape(array.shape[0], -1)
        else:
            raise ValueError(f"Unexpected 3D shape: {array.shape}")
    return array

# Prepare speaker data for HMM training
speaker_data = {}

# Collect MFCC features for each speaker
for i in range(len(train_dataset)):
    mfcc_features, label = train_dataset[i]
    label = label.item()  # Convert label to a scalar value
    if label not in speaker_data:
        speaker_data[label] = []
    speaker_data[label].append(mfcc_features.numpy())

# Check the dimensions of each array and ensure they are 2D
for speaker in speaker_data:
    for i, features in enumerate(speaker_data[speaker]):
        speaker_data[speaker][i] = ensure_2d(features)

# Stack the arrays for each speaker
for speaker in speaker_data:
    try:
        speaker_data[speaker] = np.vstack(speaker_data[speaker])
    except ValueError as e:
        print(f"Error stacking features for speaker {speaker}: {e}")

# Print the number of speakers and some example data
print(f'Number of speakers: {len(speaker_data)}')
for speaker, features in speaker_data.items():
    print(f'Speaker: {speaker}, MFCC shape: {features.shape}')
    break  # Print only the first speaker's data

Error stacking features for speaker 5: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1064 and the array at index 1 has size 13
Number of speakers: 7
Speaker: 0, MFCC shape: (95722, 13)


In [48]:
# Train HMM models
models = train_hmm(speaker_data)



An error occurred while training HMM for speaker 5: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (4,) + inhomogeneous part.


In [51]:
# Example to recognize speaker
test_audio_path = '/kaggle/input/studentvoice/nitanyahuvoice.wav'
recognized_speaker = recognize_speaker(test_audio_path, models)
if recognized_speaker is not None:
    recognized_speaker_label = label_encoder.inverse_transform([recognized_speaker])[0]
    print(f'Recognized Speaker: {recognized_speaker_label}')
else:
    print('Speaker could not be recognized.')

Error scoring speaker 0: Found array with dim 3. None expected <= 2.
Error scoring speaker 2: Found array with dim 3. None expected <= 2.
Error scoring speaker 1: Found array with dim 3. None expected <= 2.
Error scoring speaker 3: Found array with dim 3. None expected <= 2.
Error scoring speaker 4: Found array with dim 3. None expected <= 2.
Error scoring speaker 6: Found array with dim 3. None expected <= 2.
Speaker could not be recognized.




In [52]:
import torchaudio
import IPython.display as ipd

# Load the WAV file
waveform, sample_rate = torchaudio.load()

# Print basic information about the audio
print(f"Sample Rate: {sample_rate}")
print(f"Waveform Shape: {waveform.shape}")

# Play the audio (works in Jupyter Notebook)
ipd.Audio(waveform.numpy(), rate=sample_rate)

Sample Rate: 44100
Waveform Shape: torch.Size([2, 675777])


In [66]:
import os
import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
import IPython.display as ipd

class AudioDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.file_names = [file for file in os.listdir(root_dir) if file.endswith('.wav')]

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        audio_path = os.path.join(self.root_dir, self.file_names[idx])
        waveform, sample_rate = torchaudio.load(audio_path)
        
        # Ensure waveform is either 1D or 2D
        if len(waveform.shape) == 1:
            waveform = waveform.unsqueeze(0)  # Convert to 2D (1, N) for consistency
        if self.transform:
            waveform = self.transform(waveform)
        return waveform, sample_rate, self.file_names[idx]



# Define the path to your dataset directory
dataset_dir = '/kaggle/input/speaker-recognition-dataset/16000_pcm_speeches/Benjamin_Netanyau'

# Create an instance of the AudioDataset
audio_dataset = AudioDataset(root_dir=dataset_dir)

In [69]:
# Define the index or file name of the specific audio you want to listen to
audio_index =765  # For example, the first file in the dataset
# or
# audio_file_name = 'your_audio_file.wav'
# audio_index = audio_dataset.file_names.index(audio_file_name)

# Load the specific audio file
waveform, sample_rate, file_name = audio_dataset[audio_index]

# Print details about the loaded file
print(f"Loaded file: {file_name}")
print(f"Sample Rate: {sample_rate}")
print(f"Waveform Shape: {waveform.shape}")

# Play the audio
ipd.display(ipd.Audio(waveform.squeeze().numpy(), rate=sample_rate))

Loaded file: 245.wav
Sample Rate: 16000
Waveform Shape: torch.Size([1, 16000])
