In [1]:
import wespeaker
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
class VoiceAuth:
    """Thin convenience wrapper around a wespeaker speaker model.

    Each public method forwards to the method of the same name on the
    underlying model (``self.model``) and returns whatever that call returns.
    """

    # Device ids handed to ``set_gpu``: a non-negative id selects that CUDA
    # device, any negative number means "run on CPU". An integer is used for
    # the CPU case so the value is a valid device id rather than a stray float.
    _GPU_DEVICE_ID = 0
    _CPU_DEVICE_ID = -1

    def __init__(self, gpu=True, lang="english"):
        """Load the speaker model and pick the inference device.

        Args:
            gpu: If truthy, request CUDA device 0; otherwise run on CPU.
            lang: Model identifier passed to ``wespeaker.load_model``.
                Defaults to ``"english"``, the value previously hard-coded.
        """
        self.model = wespeaker.load_model(lang)
        self.model.set_gpu(self._GPU_DEVICE_ID if gpu else self._CPU_DEVICE_ID)

    def register(self, speaker_id, audio_fpath):
        """Enrol ``audio_fpath`` under ``speaker_id`` in the model; returns None."""
        self.model.register(speaker_id, audio_fpath)

    def recognize(self, audio_fpath):
        """Match ``audio_fpath`` against the enrolled speakers.

        Returns the model's result unchanged (in this notebook's saved output,
        a dict with ``'name'`` and ``'confidence'`` keys).
        """
        return self.model.recognize(audio_fpath)

    def extract_embedding(self, audio_fpath):
        """Return the model's speaker embedding for ``audio_fpath``."""
        return self.model.extract_embedding(audio_fpath)

    def compute_similarity(self, audio1_fpath, audio2_fpath):
        """Return the model's similarity result for the two audio files."""
        return self.model.compute_similarity(audio1_fpath, audio2_fpath)

    def diarize(self, audio_fpath):
        """Return the model's diarization result for ``audio_fpath``."""
        return self.model.diarize(audio_fpath)

In [3]:
# Build the wrapper with GPU inference requested (VoiceAuth maps gpu=True to device id 0).
voice_auth = VoiceAuth(gpu=True)



In [4]:
# Enrol a reference recording under the speaker id "kiddo"; the recognize() calls below report matches by this id.
voice_auth.register("kiddo", "../data/speech/test_kiddo.wav")

In [5]:
# Query with a second clip (presumably the same speaker, judging by the file name).
# Left as the bare last expression so the notebook displays the returned result.
voice_auth.recognize("../data/speech/test_kiddo2.wav")

{'name': 'kiddo', 'confidence': 0.8216253519058228}

In [6]:
# Query with a clip that is presumably from a different speaker (file name differs).
# NOTE(review): the saved output still names "kiddo", the only enrolled id, at a lower
# confidence (~0.49 vs ~0.82) — a confidence threshold is needed before treating this as a match.
voice_auth.recognize("../data/speech/test_baba.wav")

{'name': 'kiddo', 'confidence': 0.4939890019595623}

In [7]:
# Compute the speaker embedding for the enrolment clip; the saved output shows a tensor of floats.
voice_auth.extract_embedding("../data/speech/test_kiddo.wav")

tensor([ 0.0167, -0.0538, -0.0618,  0.0565,  0.0041, -0.0066,  0.0452,  0.0706,
         0.0290,  0.0392,  0.0530, -0.0534, -0.0267,  0.1458, -0.0901,  0.0207,
         0.0305, -0.1004, -0.0178,  0.0474,  0.0086,  0.0431,  0.0554,  0.0493,
        -0.0545,  0.0652, -0.0171,  0.0260,  0.0546, -0.0230, -0.0894, -0.0105,
         0.1120, -0.0152,  0.0238, -0.0394, -0.0427, -0.0344,  0.0498,  0.0069,
        -0.0123, -0.0674,  0.0517, -0.0314,  0.0040, -0.0280, -0.0021,  0.0494,
         0.0656,  0.0201, -0.0147,  0.0098,  0.0369, -0.0293, -0.0634, -0.0307,
        -0.0581,  0.0486, -0.0383,  0.0019, -0.0140, -0.0305,  0.0166,  0.0271,
        -0.0300,  0.0590,  0.0367,  0.0574,  0.0036, -0.0534, -0.0622, -0.0599,
        -0.0787, -0.0500,  0.0238,  0.0027,  0.0452,  0.0584, -0.0205, -0.0422,
        -0.0650,  0.0112,  0.0304,  0.0732, -0.0042,  0.0462, -0.0399,  0.0234,
         0.0614,  0.1059, -0.0856, -0.0213,  0.0176, -0.0793, -0.0100,  0.0076,
         0.0061, -0.0465,  0.0205, -0.01

In [8]:
# SpeechUsersDB is defined in the speech_users_db module (not shown here; presumably project-local).
from speech_users_db import SpeechUsersDB

# Users database created with lang="english"; it is filled via add_record() and written out via save_db() further down.
db = SpeechUsersDB(lang="english")



In [9]:
# Load the registration table; the columns used later are speaker_id, video_id and format.
registration_df = pd.read_csv("../data/speech/concat_audio_registration_df.csv")
# Bare last expression so the notebook renders the frame.
registration_df

Unnamed: 0,speaker_id,video_id,format,channels,sample_width,frame_rate,number_of_frames,duration_seconds
0,id00694,IgUtEqwqXgg,wav,2,2,16000,588800,36.80
1,id00799,ELtVwFSaak0,wav,2,2,16000,119680,7.48
2,id00868,9SHE8pQGkf4,wav,2,2,16000,243200,15.20
3,id01616,18lc1aMx5ZM,wav,2,2,16000,583680,36.48
4,id02799,aqfXg_jXqjw,wav,2,2,16000,16000,1.00
...,...,...,...,...,...,...,...,...
94,id58163,jqZ3ZWcYvnU,wav,2,2,16000,344320,21.52
95,id58257,aCN3x_uGAEs,wav,2,2,16000,17920,1.12
96,id60231,oRAyifLDyd4,wav,2,2,16000,597120,37.32
97,id60691,4emJHSSJhnU,wav,2,2,16000,169600,10.60


In [10]:
# Number of distinct speaker ids in the registration table.
registration_df["speaker_id"].nunique()

99

In [12]:
# Root folder for the speech assets, relative to the notebook's working directory.
SPEECH_DATA_DIR = Path("../data/speech/")
# Sub-folder holding the audio files that get enrolled into the users database.
AUDIO_DIR = SPEECH_DATA_DIR.joinpath("concat_audio")
# Destination file the users database is saved to.
USER_DB_PATH = SPEECH_DATA_DIR.joinpath("concat_audio_users_db.pkl")

In [None]:
# Enrol every row of the registration table into the users database.
# itertuples() yields lightweight named tuples instead of building a Series per row,
# and passing `total` lets tqdm size the progress bar without copying all rows into a list first.
for row in tqdm(registration_df.itertuples(index=False), total=len(registration_df)):
    # Files are laid out as <AUDIO_DIR>/<speaker_id>/<video_id>.<format>.
    audio_fpath = AUDIO_DIR / row.speaker_id / f"{row.video_id}.{row.format}"
    db.add_record(row.speaker_id, audio_fpath)

In [13]:
# Write the users database to USER_DB_PATH.
# NOTE(review): the enrolment loop cell above has no execution count in the saved notebook —
# confirm it has run to completion before saving, otherwise the database may be empty or partial.
db.save_db(USER_DB_PATH)