Import libraries

In [1]:
import os
import torch
from speechbrain.pretrained import SpeakerRecognition
from speechbrain.pretrained import EncoderClassifier
from speechbrain.pretrained import Pretrained

import sqlite3
import tempfile
import re
import concurrent.futures
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


Define the SpeakerVerification class

In [8]:
class SpeakerVerification:
    """Label audio segments as belonging (1) or not belonging (0) to a reference speaker.

    Reference ``.wav`` files are turned into speaker embeddings with the
    pretrained ``speechbrain/spkrec-ecapa-voxceleb`` model. Every ``.mp3``
    segment is embedded the same way and attributed to the speaker when its
    best cosine similarity against the references exceeds a threshold.

    Attributes:
        reference_folder: Directory holding the reference ``.wav`` files.
        folder_path: Directory holding the ``.mp3`` segments to verify.
        device: Device (``torch.device`` or string) used for the model and tensors.
        sr_model, ec_model, p_model: Handles to the loaded model (see ``load_models``).
    """

    # Model source identifier and local directory passed to ``from_hparams``.
    MODEL_SOURCE = "speechbrain/spkrec-ecapa-voxceleb"
    MODEL_SAVEDIR = "pretrained_models/spkrec-ecapa-voxceleb"
    # Cosine-similarity score above which a segment is attributed to the speaker.
    DEFAULT_THRESHOLD = 0.25

    def __init__(self, reference_folder, folder_path, device):
        self.reference_folder = reference_folder
        self.folder_path = folder_path
        self.device = device

        self.sr_model, self.ec_model, self.p_model = self.load_models()

    def load_models(self):
        """Load the pretrained model once and return it as a 3-tuple.

        ``SpeakerRecognition`` derives from ``EncoderClassifier``, which derives
        from ``Pretrained``, so a single instance offers everything the three
        separately loaded copies did (``load_audio`` and ``encode_batch``
        included) at a third of the memory and load time. The device is passed
        through ``run_opts`` so that the interface's own device setting matches
        the device the weights live on; calling ``.to(device)`` on the module
        alone does not update that setting.

        Returns:
            tuple: ``(sr_model, ec_model, p_model)``, all referring to the same object.
        """
        model = SpeakerRecognition.from_hparams(
            source=self.MODEL_SOURCE,
            savedir=self.MODEL_SAVEDIR,
            run_opts={"device": str(self.device)},
        )
        return model, model, model

    @staticmethod
    def _segment_sort_key(file):
        """Return ``(stem, segment_number)`` so segments sort numerically per recording.

        Accepts both ``name_segment12...`` and ``name_segment_012...``; names
        without a segment number sort as ``(file, 0)``.
        """
        match = re.match(r'(.*)_segment_?(\d+)', file)
        if match:
            return match.group(1), int(match.group(2))
        return file, 0

    def _embed_file(self, path):
        """Load one audio file and return its normalized embedding on ``self.device``."""
        waveform = self.p_model.load_audio(path=path)
        # encode_batch expects a batch dimension in front of the waveform.
        batch = waveform.unsqueeze(0).to(self.device)
        return self.ec_model.encode_batch(batch, normalize=True).to(self.device)

    def extract_reference_embeddings(self):
        """Compute one embedding per ``.wav`` file in ``reference_folder``.

        Files are processed in sorted name order so the result is reproducible
        across runs and platforms.

        Returns:
            list: Embedding tensors, one per reference file (empty if none found).
        """
        emb_refs = []

        for ref_file in sorted(os.listdir(self.reference_folder)):
            if not ref_file.endswith('.wav'):
                continue
            print(f"Creating embedding for reference file: {ref_file}")
            emb_refs.append(self._embed_file(os.path.join(self.reference_folder, ref_file)))

        return emb_refs

    def compare_embeddings(self, emb_refs, emb_y, threshold=DEFAULT_THRESHOLD):
        """Compare a candidate embedding with every reference embedding.

        Args:
            emb_refs: Non-empty sequence of reference embedding tensors.
            emb_y: Embedding tensor of the candidate segment.
            threshold: Score that the best cosine similarity must exceed.

        Returns:
            int: 1 if the highest similarity is greater than ``threshold``, else 0.

        Raises:
            ValueError: If ``emb_refs`` is empty.
        """
        if len(emb_refs) == 0:
            raise ValueError("emb_refs is empty: at least one reference embedding is required")

        similarity = torch.nn.CosineSimilarity(dim=-1, eps=1e-6)

        # Reduce with torch rather than numpy so tensors on any device work;
        # torch.max propagates NaN, and NaN > threshold is False (result 0).
        scores = torch.stack([similarity(emb_y, emb_ref).max() for emb_ref in emb_refs])
        max_score = float(scores.max())

        return 1 if max_score > threshold else 0

    def verify(self, emb_refs, metadata_file='verification_metadata.txt'):
        """Verify every ``.mp3`` in ``folder_path`` and append the results to a file.

        One line ``<file name>|<0 or 1>`` is appended to ``metadata_file`` per
        successfully processed segment; the file is created when missing and
        existing content is kept, so running twice appends a second set of
        lines. A segment that fails is reported together with the error and
        skipped. Progress is counted over the ``.mp3`` files only.

        Args:
            emb_refs: Reference embeddings from ``extract_reference_embeddings``.
            metadata_file: Path of the results file.

        Raises:
            ValueError: If ``emb_refs`` is empty (nothing could be verified).
        """
        if len(emb_refs) == 0:
            raise ValueError("emb_refs is empty: extract reference embeddings before verifying")

        files = sorted(
            (name for name in os.listdir(self.folder_path) if name.endswith('.mp3')),
            key=self._segment_sort_key,
        )
        total_files = len(files)

        # Opened once for the whole run; flushed per line so results already
        # written survive an interruption.
        with open(metadata_file, 'a', encoding='utf-8') as f:
            for count, file in enumerate(files, start=1):
                try:
                    emb_y = self._embed_file(os.path.join(self.folder_path, file))
                    is_speaker = self.compare_embeddings(emb_refs, emb_y)
                    f.write(f"{file}|{is_speaker}\n")
                    f.flush()
                except Exception as e:
                    # Best effort: keep going, but say why the file was skipped.
                    print(f"Error verifying file: {file} ({e!r})")

                # print progress percentage
                print(f"Progress: {count}/{total_files} ({count/total_files*100:.2f}%)", end='\r')

        if files:
            # Terminate the carriage-return progress line.
            print()



Set device and folder paths

In [9]:
device = torch.device('cpu')  # hard-coded to CPU; a GPU is not selected automatically here


# Directory of reference recordings; only files ending in '.wav' are read from it.
reference_folder = r"C:\Users\Harsh\Documents\gap\gapvoice\audio_preprocessing\src\standard"
# Directory of segments to verify; only files ending in '.mp3' are read from it.
folder_path = r"C:\Users\Harsh\Documents\gap\gapvoice\audio_preprocessing\src\segments"


Initialize SpeakerVerification and extract reference embeddings

In [10]:
# Build the verifier; its constructor loads the pretrained models via load_models().
spk_verify = SpeakerVerification(reference_folder, folder_path, device)
# One embedding per '.wav' file found in reference_folder.
emb_refs = spk_verify.extract_reference_embeddings()


Creating embedding for reference file: I Finally Listened To You.mp3_segment_000.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_001.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_002.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_003.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_004.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_005.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_006.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_007.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_008.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_009.wav
Creating embedding for reference file: I Finally Listened To You.mp3_segment_010.wav
Creating embedding for reference file: I Finally Listened To You.

Run verification

In [11]:
# Score every '.mp3' in folder_path against the reference embeddings; results are
# appended as "<file>|<0 or 1>" lines to verification_metadata.txt.
spk_verify.verify(emb_refs)


Progress: 73062/73062 (100.00%)