In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import numpy as np
import pandas as pd

import librosa
from pathlib import Path

In [3]:
# Working directory of the kernel; the training CSV below is located relative to it.
dir_path = os.getcwd()

In [4]:
# Training metadata table, read from ../data relative to the working directory.
# compute_scores later uses its user_ids_num, duration and audio_paths columns.
train = pd.read_csv(os.path.join(dir_path, "..", "data/afritts-train-clean.csv"))


In [5]:
train.columns

Index(['idx', 'user_ids', 'accent', 'age_group', 'gender', 'country',
       'transcript', 'nchars', 'audio_ids', 'audio_paths', 'duration',
       'neg_percent', 'origin', 'domain', 'split', 'expand_puncts', 'is_alnum',
       'user_ids_num'],
      dtype='object')

In [6]:
import json
import pickle as pkl
def load_speaker_emb(path):
    """Load a speaker-embedding dictionary from a pickle or JSON file.

    The on-disk format is chosen from the file extension (case-insensitive):
    ``pkl`` is read with ``pickle`` and ``json`` with ``json``.

    Args:
        path: Location of the embedding file, given as a ``str`` or any
            ``os.PathLike`` object such as ``pathlib.Path``.

    Returns:
        The deserialized object stored in the file.

    Raises:
        TypeError: If the extension is neither ``pkl`` nor ``json``.
    """
    print(f"Loading external speaker embeddings from {path}")
    # Normalise to a plain string so pathlib.Path inputs work, and inspect only
    # the final path component so a dot in a directory name is never mistaken
    # for a file extension.
    file_name = os.path.basename(os.fspath(path))
    extension = file_name.rsplit(".", 1)[-1].lower() if "." in file_name else ""

    if extension == "pkl":
        # SECURITY: unpickling can execute arbitrary code; only load trusted files.
        with open(path, "rb") as read_file:
            return pkl.load(read_file)
    if extension == "json":
        # Explicit encoding so the result does not depend on the locale default.
        with open(path, "r", encoding="utf-8") as read_file:
            return json.load(read_file)
    raise TypeError("Speaker embedding type unrecognized")

In [7]:
# Precomputed speaker embeddings (path is relative to the working directory).
# compute_scores indexes this dict by audio-file stem, i.e. Path(...).stem.
speaker_emb_dict = load_speaker_emb("AfriSpeech-TTS/src/vits/AfriSpeech-Models/embeddings/afritts_emb_intsp.pkl")

Loading external speaker embeddings from /srv/storage/talc2@talc-data2.nancy/multispeech/calcul/users/sogun/AfriSpeech-TTS/src/vits/AfriSpeech-Models/embeddings/afritts_emb_intsp.pkl


In [8]:
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np

# Single shared Resemblyzer encoder instance, used by compute_embedding below.
encoder = VoiceEncoder()

def compute_embedding(path):
    """Return the utterance embedding of the audio file at ``path``.

    The file is loaded and preprocessed with ``preprocess_wav`` and the result
    is embedded with the notebook-level ``encoder``.
    """
    waveform = preprocess_wav(Path(path))
    return encoder.embed_utterance(waveform)

Loaded the voice encoder model on cuda in 0.28 seconds.


In [9]:
import numpy as np
from sklearn.metrics import roc_curve

"""
Python compute equal error rate (eer)
ONLY tested on binary classification

:param label: ground-truth label, should be a 1-d list or np.array, each element represents the ground-truth label of one sample
:param pred: model prediction, should be a 1-d list or np.array, each element represents the model prediction of one sample
:param positive_label: the class that is viewed as positive class when computing EER
:return: equal error rate (EER)
"""
# def compute_eer(label, pred, positive_label=1):
#     # fpr, tpr, fnr and threshold are all arrays (np.array)
#     fpr, tpr, threshold = sklearn.metrics.roc_curve(label, pred, positive_label)
#     fnr = 1 - tpr

#     # the threshold of fnr == fpr
#     eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]

#     # theoretically the EER from fpr and the EER from fnr should be identical, but they can differ slightly in practice
#     eer_1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
#     eer_2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]

#     # return the mean of eer from fpr and from fnr
#     eer = (eer_1 + eer_2) / 2
#     return eer


def compute_eer(distances, labels):
    """Estimate the equal error rate (EER) from per-pair scores and binary labels.

    Builds the ROC curve with ``roc_curve(labels, distances)`` and selects the
    operating point where the false-negative rate (``1 - TPR``) is closest to
    the false-positive rate.

    Args:
        distances: Per-pair scores, forwarded to ``roc_curve`` as its score
            argument. NOTE(review): despite the name, ``roc_curve`` treats
            larger values as more indicative of the positive class, so
            similarity scores are expected here - confirm against callers.
        labels: Ground-truth binary label for each pair.

    Returns:
        The false-positive rate at the selected operating point (not the mean
        of the false-positive and false-negative rates).
    """
    false_pos_rates, true_pos_rates, _thresholds = roc_curve(labels, distances)
    false_neg_rates = 1 - true_pos_rates
    crossing = np.nanargmin(np.absolute(false_neg_rates - false_pos_rates))
    return false_pos_rates[crossing]

In [11]:
def compute_scores(train, speaker_df, *, audio_root="AfriSpeech-TTS/src/vits",
                   speaker_embeddings=None, embed_fn=None, max_positive_pairs=5):
    """Build similarity scores and binary labels for EER evaluation.

    For every row of ``speaker_df`` the audio at ``audio_root/audio_path`` is
    embedded and compared (dot product) with the reference embedding of each
    speaker listed in ``spk_ids``; these pairs are labelled ``0``. The
    reference utterance of a speaker is that speaker's longest training
    utterance. Afterwards the reference is compared with the next-longest
    training utterances of the same speaker; these pairs are labelled ``1``.

    NOTE(review): the label-1 pairs are built only for the LAST speaker listed
    in ``spk_ids`` (behaviour preserved from the original code) - confirm
    whether every listed speaker should contribute them.

    Args:
        train: DataFrame with ``user_ids_num``, ``duration`` and
            ``audio_paths`` columns.
        speaker_df: DataFrame with ``spk_ids`` (a single integer id, or ids
            joined by ``-`` such as ``"12-40"``) and ``audio_path`` columns.
        audio_root: Directory that ``audio_path`` values are joined onto.
        speaker_embeddings: Mapping from audio-file stem to embedding vector.
            Defaults to the notebook-level ``speaker_emb_dict``.
        embed_fn: Callable turning an audio path into an embedding vector.
            Defaults to the notebook-level ``compute_embedding``.
        max_positive_pairs: Maximum number of label-1 pairs added per row of
            ``speaker_df``.

    Returns:
        Tuple ``(scores, labels)`` of equally long 1-d numpy arrays.

    Raises:
        ValueError: If a listed speaker id has no rows in ``train``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if speaker_embeddings is None:
        speaker_embeddings = speaker_emb_dict
    if embed_fn is None:
        embed_fn = compute_embedding

    emb_scores = []
    true_labels = []

    for raw_ids, rel_audio_path in zip(speaker_df["spk_ids"], speaker_df["audio_path"]):
        # str() guards against pandas having parsed the column as integers,
        # which happens when no row contains a "-"-joined list of ids.
        speaker_ids = [int(part) for part in str(raw_ids).strip().split("-")]

        gen_emb = embed_fn(os.path.join(audio_root, rel_audio_path))

        for speaker_id in speaker_ids:
            # Longest utterance first: row 0 is the speaker's reference sample.
            ref_rows = (
                train[train.user_ids_num == speaker_id]
                .sort_values(by="duration", axis=0, ascending=False)
                .reset_index(drop=True)
            )
            if ref_rows.empty:
                raise ValueError(
                    f"No training utterances found for speaker id {speaker_id}"
                )
            ref_emb = speaker_embeddings[Path(ref_rows["audio_paths"].iloc[0]).stem]

            emb_scores.append(gen_emb @ ref_emb)
            true_labels.append(0.0)

        # Same-speaker pairs: reference vs. the following rows of the last
        # speaker processed above (row 0 is the reference itself, so skip it).
        for other_path in ref_rows["audio_paths"].iloc[1:1 + max_positive_pairs]:
            emb_scores.append(ref_emb @ speaker_embeddings[Path(other_path).stem])
            true_labels.append(1.0)

    return np.array(emb_scores), np.array(true_labels)

In [23]:
# Pipe-separated manifest of the audio to evaluate; compute_scores reads its
# spk_ids and audio_path columns (presumably synthesized samples - confirm).
speaker_data = pd.read_csv("AfriSpeech-TTS/src/vits/afritts_accent_interpolate/vits_afrotts_ft.txt",
                          sep="|")

# Similarity scores and their 0/1 labels for the EER computation in the next cell.
emb_scores, true_labels = compute_scores(train, speaker_data)

In [24]:
# Equal error rate over the scores and labels returned by compute_scores.
compute_eer(distances=emb_scores, labels=true_labels)

# Call form for the commented-out compute_eer(label, pred, positive_label) variant:
# compute_eer(label=true_labels, pred=emb_scores, positive_label=1)

0.1619718309859155

In [94]:
emb_scores

array([0.8071718 , 0.80671734, 0.7872343 , 0.7897619 , 0.8950098 ,
       0.9652452 , 0.84654284, 0.9294492 , 0.847406  , 0.8987149 ,
       0.8091531 , 0.8771695 , 0.8742709 , 0.79306334, 0.82663244,
       0.883126  , 0.94675064, 0.8501967 , 0.91839015, 0.7022468 ,
       0.8744534 , 0.7522301 , 0.6231321 , 0.79781926, 0.7736032 ,
       0.73976034, 0.8032656 , 0.8236902 , 0.8798985 , 0.8746427 ,
       0.8073773 , 0.8717165 , 0.8382913 , 0.79134226, 0.7888961 ,
       0.860077  , 0.9477446 , 0.91521406, 0.8604313 , 0.94008124,
       0.9242471 , 0.94013894, 0.92661846, 0.9538962 , 0.9417337 ,
       0.80511904, 0.89117026, 0.8375808 , 0.6878051 , 0.79530144,
       0.94426274, 0.90570045, 0.9251994 , 0.9421374 , 0.9272267 ,
       0.90955234, 0.85227203, 0.9392668 , 0.8886725 , 0.9126222 ,
       0.9243428 , 0.8197731 , 0.83038634, 0.8212093 , 0.9446909 ,
       0.9030249 , 0.9502391 , 0.9244637 , 0.9672859 , 0.943462  ,
       0.95267177, 0.9720604 , 0.9586836 , 0.9715664 , 0.96121