<a href="https://colab.research.google.com/github/inachenyx/SpeechSNN/blob/main/SpeakerExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torchaudio snntorch tqdm

Collecting snntorch
  Downloading snntorch-0.9.4-py2.py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0

### Speaker Recognition Pipeline

In [2]:
import torch
import torchaudio
import torchaudio.transforms as T
import snntorch as snn
from snntorch import utils
from torch import nn, optim
from torch.utils.data import DataLoader, random_split
from torchaudio.datasets import SPEECHCOMMANDS
import os
import random
import numpy as np
from tqdm import tqdm

# Seed every RNG the notebook touches (PyTorch, NumPy, stdlib) with one shared value
SEED = 0
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


### Dataset Setup

In [3]:
# Speech Commands restricted to one of the official splits
class SubsetSC(SPEECHCOMMANDS):
    """SPEECHCOMMANDS limited to the files of one official split.

    The dataset is downloaded into the current directory on first use.

    Args:
        subset: ``"training"``, ``"validation"`` or ``"testing"``. Any other
            value (including ``None``) leaves the full, unfiltered file list
            of the parent class in place.
    """

    def __init__(self, subset: str = None):
        super().__init__(".", download=True)

        def load_list(filename):
            """Return the normalised paths named in one of the split list files."""
            filepath = os.path.join(self._path, filename)
            with open(filepath) as f:
                # normpath strips the leading "./" that comes from root=".".
                # Without it these paths may never compare equal to the entries
                # of the parent's walker, and the "training" exclusion below
                # would silently keep validation/test clips in the training set.
                # Blank lines are skipped so they cannot turn into a bogus entry.
                return [
                    os.path.normpath(os.path.join(self._path, line.strip()))
                    for line in f
                    if line.strip()
                ]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            # Training = everything that is in neither held-out list.
            excludes = set(
                load_list("validation_list.txt") + load_list("testing_list.txt")
            )
            self._walker = [
                w for w in self._walker if os.path.normpath(w) not in excludes
            ]

# Materialise the two splits used by the rest of the notebook.
# Constructing SubsetSC passes download=True to the parent dataset.
train_dataset = SubsetSC(subset="training")
test_dataset = SubsetSC(subset="testing")


100%|██████████| 2.26G/2.26G [00:32<00:00, 75.1MB/s]


In [None]:
# MFCC front end: 13 coefficients computed from a 40-band mel spectrogram.
# At 16 kHz, n_fft=400 is a 25 ms window and hop_length=160 is a 10 ms step.
mfcc_transform = T.MFCC(
    sample_rate=16000,
    n_mfcc=13,
    melkwargs=dict(n_fft=400, hop_length=160, n_mels=40),
)

def extract_mfcc(dataset, max_len=80, speaker_to_idx=None):
    """Turn every item of ``dataset`` into a fixed-width MFCC matrix with a speaker label.

    Each item is unpacked as ``(waveform, sample_rate, _, speaker_id, _)``; only
    the waveform and the speaker ID are used. The waveform goes through the
    module-level ``mfcc_transform`` and is then cropped or zero-padded along the
    time axis to exactly ``max_len`` frames.

    Args:
        dataset: Iterable of 5-tuples as described above.
        max_len: Number of time frames every returned MFCC matrix will have.
        speaker_to_idx: Optional existing ``{speaker_id: index}`` mapping to
            reuse, e.g. the one returned for the training split, so that labels
            are comparable across calls. Assumed to use contiguous indices
            ``0..len-1`` (which is what this function produces). It is copied,
            never mutated. When omitted, a fresh mapping is built.

    Returns:
        Tuple ``(features, labels, speaker_to_idx)``: the stacked MFCC tensor,
        a tensor of integer speaker indices, and the (possibly extended)
        mapping. Speakers are numbered in order of first appearance.

    Raises:
        RuntimeError: Raised by ``torch.stack`` when ``dataset`` is empty.
    """
    # Copy so that a caller-supplied mapping is left untouched.
    speaker_to_idx = dict(speaker_to_idx) if speaker_to_idx is not None else {}
    features, labels = [], []

    for waveform, _sample_rate, _, speaker_id, _ in tqdm(dataset):
        # Drop the leading channel axis, then crop to at most max_len frames.
        mfcc = mfcc_transform(waveform).squeeze(0)[:, :max_len]

        # Zero-pad on the right when the clip is shorter than max_len frames.
        missing = max_len - mfcc.shape[1]
        if missing > 0:
            mfcc = torch.nn.functional.pad(mfcc, (0, missing))

        # First sighting of a speaker claims the next free index.
        label = speaker_to_idx.setdefault(speaker_id, len(speaker_to_idx))

        features.append(mfcc)
        labels.append(label)

    return torch.stack(features), torch.tensor(labels), speaker_to_idx

# Pre-compute MFCC features and integer speaker labels for both splits.
# NOTE(review): each call builds its own speaker -> index mapping, so the indices
# in y_test are not aligned with those in y_train / speaker_dict — confirm this is
# intended before scoring speaker identification on the test split.
X_train, y_train, speaker_dict = extract_mfcc(dataset=train_dataset)
X_test, y_test, _ = extract_mfcc(dataset=test_dataset)
