# Installs

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install transformers



In [3]:
#!pip install git+https://github.com/speechbrain/speechbrain.git@develop

In [4]:
import os
from pathlib import Path
from typing import List, Tuple, Union
from tqdm import tqdm
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.utils import _extract_zip, _load_waveform

# Dataset Class For Voxceleb from TorchAudio


In [5]:
SAMPLE_RATE = 16000
_ARCHIVE_CONFIGS = {
    # "dev": {
    #     "archive_name": "vox1_dev_wav.zip",
    #     "urls": [
    #         "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
    #         "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
    #         "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
    #         "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
    #     ],
    #     "checksums": [
    #         "21ec6ca843659ebc2fdbe04b530baa4f191ad4b0971912672d92c158f32226a0",
    #         "311d21e0c8cbf33573a4fce6c80e5a279d80736274b381c394319fc557159a04",
    #         "92b64465f2b2a3dc0e4196ae8dd6828cbe9ddd1f089419a11e4cbfe2e1750df0",
    #         "00e6190c770b27f27d2a3dd26ee15596b17066b715ac111906861a7d09a211a5",
    #     ],
    # },
    "test": {
        "archive_name": "vox1_test_wav.zip",
        "url": "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip",
        "checksum": "8de57f347fe22b2c24526e9f444f689ecf5096fc2a92018cf420ff6b5b15eaea",
    },
}
_IDEN_SPLIT_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt"
_VERI_TEST_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt"


def _download_extract_wavs(root: str):
    for archive in ["test"]:
        archive_name = _ARCHIVE_CONFIGS[archive]["archive_name"]
        archive_path = os.path.join(root, archive_name)
        # The zip file of dev data is splited to 4 chunks.
        # Download and combine them into one file before extraction.
        if archive == "dev":
            urls = _ARCHIVE_CONFIGS[archive]["urls"]
            checksums = _ARCHIVE_CONFIGS[archive]["checksums"]
            with open(archive_path, "wb") as f:
                for url, checksum in zip(urls, checksums):
                    file_path = os.path.join(root, os.path.basename(url))
                    download_url_to_file(url, file_path, hash_prefix=checksum)
                    with open(file_path, "rb") as f_split:
                        f.write(f_split.read())
        else:
            url = _ARCHIVE_CONFIGS[archive]["url"]
            checksum = _ARCHIVE_CONFIGS[archive]["checksum"]
            download_url_to_file(url, archive_path, hash_prefix=checksum)
        _extract_zip(archive_path)


def _get_flist(root: str, file_path: str, subset: str) -> List[str]:
    f_list = []
    if subset == "train":
        index = 1
    elif subset == "dev":
        index = 2
    else:
        index = 3
    with open(file_path, "r") as f:
        for line in f:
            id, path = line.split()
            if int(id) == index:
                f_list.append(path)
    return sorted(f_list)


def _get_paired_flist(root: str, veri_test_path: str):
    f_list = []
    with open(veri_test_path, "r") as f:
        for line in f:
            label, path1, path2 = line.split()
            f_list.append((label, path1, path2))
    return f_list


def _get_file_id(file_path: str, _ext_audio: str):
    speaker_id, youtube_id, utterance_id = file_path.split("/")[-3:]
    utterance_id = utterance_id.replace(_ext_audio, "")
    file_id = "-".join([speaker_id, youtube_id, utterance_id])
    return file_id


In [6]:
class VoxCeleb1(Dataset):
    """*VoxCeleb1* :cite:`nagrani2017voxceleb` dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).
    """

    _ext_audio = ".wav"

    def __init__(self, root: Union[str, Path], download: bool = False) -> None:
        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)
        self._path = os.path.join(root, "wav")
        if not os.path.isdir(self._path):
            if not download:
                raise RuntimeError(
                    f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
                )
            _download_extract_wavs(root)

    def get_metadata(self, n: int):
        raise NotImplementedError

    def __getitem__(self, n: int):
        raise NotImplementedError

    def __len__(self) -> int:
        raise NotImplementedError


In [7]:
class VoxCeleb1Verification(VoxCeleb1):
    """*VoxCeleb1* :cite:`nagrani2017voxceleb` dataset for speaker verification task.

    Each data sample contains a pair of waveforms, sample rate, the label indicating if they are
    from the same speaker, and the file ids.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        meta_url (str, optional): The url of meta file that contains a list of utterance pairs
            and the corresponding labels. The format of each row is ``label file_path1 file_path2".
            For example: ``1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00008.wav``.
            ``1`` means the two utterances are from the same speaker, ``0`` means not.
            (Default: ``"https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).

    Note:
        The file structure of `VoxCeleb1Verification` dataset is as follows:

        └─ root/

         └─ wav/

         └─ speaker_id folders

        Users who pre-downloaded the ``"vox1_dev_wav.zip"`` and ``"vox1_test_wav.zip"`` files need to move
        the extracted files into the same ``root`` directory.
    """

    def __init__(self, root: Union[str, Path], meta_url: str = _VERI_TEST_URL, download: bool = False) -> None:
        super().__init__(root, download)
        # download the veri_test.txt to get the list of training pairs and labels.
        meta_list_path = os.path.join(root, os.path.basename(meta_url))
        if not os.path.exists(meta_list_path):
            download_url_to_file(meta_url, meta_list_path)
        self._flist = _get_paired_flist(self._path, meta_list_path)

    def get_metadata(self, n: int) -> Tuple[str, str, int, int, str, str]:
        """Get metadata for the n-th sample from the dataset. Returns filepaths instead of waveforms,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample

        Returns:
            Tuple of the following items;

            str:
                Path to audio file of speaker 1
            str:
                Path to audio file of speaker 2
            int:
                Sample rate
            int:
                Label
            str:
                File ID of speaker 1
            str:
                File ID of speaker 2
        """
        label, file_path_spk1, file_path_spk2 = self._flist[n]
        label = int(label)
        file_id_spk1 = _get_file_id(file_path_spk1, self._ext_audio)
        file_id_spk2 = _get_file_id(file_path_spk2, self._ext_audio)
        return file_path_spk1, file_path_spk2, SAMPLE_RATE, label, file_id_spk1, file_id_spk2


    def __getitem__(self, n: int) -> Tuple[Tensor, Tensor, int, int, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform of speaker 1
            Tensor:
                Waveform of speaker 2
            int:
                Sample rate
            int:
                Label
            str:
                File ID of speaker 1
            str:
                File ID of speaker 2
        """
        metadata = self.get_metadata(n)
        waveform_spk1 = _load_waveform(self._path, metadata[0], metadata[2])
        waveform_spk2 = _load_waveform(self._path, metadata[1], metadata[2])
        return (waveform_spk1, waveform_spk2) + metadata[2:]


    def __len__(self) -> int:
        return len(self._flist)

# Loading Dataset Voxceleb

In [8]:
dataset = VoxCeleb1Verification(root="/content/", download = True)

100%|██████████| 1.00G/1.00G [01:08<00:00, 15.7MB/s]
100%|██████████| 2.23M/2.23M [00:01<00:00, 1.96MB/s]


In [9]:
dataset.get_metadata(0)

('id10270/x6uYqmx31kE/00001.wav',
 'id10270/8jEAjG6SegY/00008.wav',
 16000,
 1,
 'id10270-x6uYqmx31kE-00001',
 'id10270-8jEAjG6SegY-00008')

In [10]:
dataset[2]

(tensor([[ 0.0068,  0.0064,  0.0049,  ..., -0.0047, -0.0041, -0.0033]]),
 tensor([[ 0.0321,  0.0477,  0.0530,  ..., -0.0049, -0.0080, -0.0087]]),
 16000,
 1,
 'id10270-x6uYqmx31kE-00001',
 'id10270-GWXujl-xAVM-00017')

In [11]:
def getPath(path):
    newPath = "/content/wav/" + path[:7] + "/" + path[8:-6] + "/" + path[-5:] + ".wav"
    return newPath

In [12]:
getPath(dataset[2][5])

'/content/wav/id10270/GWXujl-xAVM/00017.wav'

In [13]:
"/content/wav/" + dataset[0][4].replace("-", "/") + ".wav"

'/content/wav/id10270/x6uYqmx31kE/00001.wav'

# EER Calculation

In [14]:
import numpy as np
import sklearn.metrics

"""
Python compute equal error rate (eer)
ONLY tested on binary classification

:param label: ground-truth label, should be a 1-d list or np.array, each element represents the ground-truth label of one sample
:param pred: model prediction, should be a 1-d list or np.array, each element represents the model prediction of one sample
:param positive_label: the class that is viewed as positive class when computing EER
:return: equal error rate (EER)
"""
def compute_eer(label, pred, positive_label=1):
    # all fpr, tpr, fnr, fnr, threshold are lists (in the format of np.array)
    fpr, tpr, threshold = sklearn.metrics.roc_curve(label, pred)
    fnr = 1 - tpr

    # the threshold of fnr == fpr
    eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]

    # theoretically eer from fpr and eer from fnr should be identical but they can be slightly differ in reality
    eer_1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
    eer_2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]

    # return the mean of eer from fpr and from fnr
    eer = (eer_1 + eer_2) / 2
    return eer

# Ecapa


In [None]:
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", run_opts={"device":"cuda"})
# signal, fs =torchaudio.load('/content/Aadiksha_6.wav')

hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [None]:
embeddings = classifier.encode_batch(dataset[0][0])

In [None]:
from speechbrain.inference.speaker import SpeakerRecognition
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")
print(verification.verify_files(getPath(dataset[2][4]),getPath(dataset[2][5])))
# print(verification.verify_files("/content/NarendraModi_1.wav", "/content/NarendraModi_2.wav"))


(tensor([0.5755]), tensor([True]))


In [None]:
print(verification.verify_files(getPath(dataset[2][4]),getPath(dataset[2][5]))[0].item())

0.5755155682563782


In [None]:
len(dataset)

37720

In [None]:
predVal = []
trueVal = []
for i in tqdm(range(1000)):
    predVal.append(verification.verify_files(getPath(dataset[i][4]),getPath(dataset[i][5]))[0].item())
    trueVal.append(dataset[i][3])

100%|██████████| 1000/1000 [20:06<00:00,  1.21s/it]


In [None]:
eer = compute_eer(trueVal, predVal)
print(eer)

0.46299999999999997


# Wavlm Base +

In [15]:
#!pip install transformers

In [16]:
from transformers import AutoFeatureExtractor, WavLMForXVector
# from datasets import load_dataset
import torch

# dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
# dataset = dataset.sort("id")
# sampling_rate = dataset.features["audio"].sampling_rate

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv")
model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv")

# # audio file is decoded on the fly
# inputs = feature_extractor(
#     [d["array"] for d in dataset[:20]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True
# )


preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/58.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/wavlm-base-plus-sv were not used when initializing WavLMForXVector: ['wavlm.encoder.pos_conv_embed.conv.weight_g', 'wavlm.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMForXVector from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMForXVector from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMForXVector were not initialized from the model checkpoint at microsoft/wavlm-base-plus-sv and are newly initialized: ['wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wavlm.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a d

In [17]:
# inputs = feature_extractor(
#     [d["array"] for d in dataset[41:60]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True
# )

In [18]:
model = model.to(torch.device("cuda"))

In [19]:
# inputs = inputs.to(torch.device("cuda"))

In [20]:
# with torch.no_grad():
#     embeddings = model(**inputs).embeddings

In [21]:
# embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()

In [22]:
# the resulting embeddings can be used for cosine similarity-based retrieval
# cosine_sim = torch.nn.CosineSimilarity(dim=-1)
# similarity = cosine_sim(embeddings[0], embeddings[1])
# threshold = 0.7  # the optimal threshold is dataset-dependent
# if similarity < threshold:
#     print("Speakers are not the same!")
# round(similarity.item(), 2)

In [23]:
# del embeddings

In [24]:
# torch.cuda.empty_cache()

In [25]:
# del embeddings

In [26]:
predVal = []
trueVal = []
index = 0
for i in range(1000):
    # data = []
    #     data.append(dataset[index][0][0].tolist())
    #     data.append(dataset[index][1][0].tolist())
    trueVal.append(dataset[i][3])
    #     index+=1
    # print(torch.Tensor(data))
    # break
    # data = torch.FloatTensor(data).to(torch.device("cuda"))
    with torch.no_grad():
        inputs = feature_extractor([dataset[i][0][0].tolist(), dataset[i][1][0].tolist()],sampling_rate = SAMPLE_RATE, padding=True, return_tensors="pt")
        inputs = inputs.to(torch.device("cuda"))
        embeddings = model(**inputs).embeddings
        embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
        similarity = cosine_sim(embeddings[0], embeddings[1])
        predVal.append(similarity)
    # del embeddings
    # break

In [27]:
preds = [x.item() for x in predVal]

In [28]:
len(preds),len(trueVal)

(1000, 1000)

In [31]:
eer = compute_eer(trueVal, preds)
print(eer)

0.10000000000000006


# WaveLM Large

In [None]:
#!pip install datasets

In [None]:
from transformers import AutoFeatureExtractor, WavLMForXVector
import torch

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-large")
model = WavLMForXVector.from_pretrained("microsoft/wavlm-large")



preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of WavLMForXVector were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'feature_extractor.bias', 'feature_extractor.weight', 'objective.weight', 'projector.bias', 'projector.weight', 'tdnn.0.kernel.bias', 'tdnn.0.kernel.weight', 'tdnn.1.kernel.bias', 'tdnn.1.kernel.weight', 'tdnn.2.kernel.bias', 'tdnn.2.kernel.weight', 'tdnn.3.kernel.bias', 'tdnn.3.kernel.weight', 'tdnn.4.kernel.bias', 'tdnn.4.kernel.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# from datasets import load_dataset
# dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
# audio = [x["array"] for x in dataset[:2]["audio"]]


In [None]:
# len(audio[0]),len(audio[1])

In [None]:
model = model.to(torch.device("cuda"))

In [None]:
predVal = []
trueVal = []
index = 0
for i in range(1000):
    # data = []
    #     data.append(dataset[index][0][0].tolist())
    #     data.append(dataset[index][1][0].tolist())
    trueVal.append(dataset[i][3])
    #     index+=1
    # print(torch.Tensor(data))
    # break
    # data = torch.FloatTensor(data).to(torch.device("cuda"))
    with torch.no_grad():
        inputs = feature_extractor([dataset[i][0][0].tolist(), dataset[i][1][0].tolist()],sampling_rate = SAMPLE_RATE, padding=True, return_tensors="pt")
        inputs = inputs.to(torch.device("cuda"))
        embeddings = model(**inputs).embeddings
        embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
        cosine_sim = torch.nn.CosineSimilarity(dim=-1)
        similarity = cosine_sim(embeddings[0], embeddings[1])
        predVal.append(similarity)
    # del embeddings

In [None]:
preds = [x.item() for x in predVal]

In [None]:
eer = compute_eer(trueVal, preds)
print(eer)

0.386
