## Notebook to evaluate TTS models

In [None]:
%load_ext autoreload
%autoreload 2

In [6]:
import scipy.stats as st
import numpy as np

import pandas as pd

In [9]:
# load csv files

# test_seen = pd.read_csv("/AfriSpeech-TTS/data/afritts-test-seen-clean.csv")
# test_unseen = pd.read_csv("/AfriSpeech-TTS/data/afritts-test-unseen-clean.csv")


In [3]:
# function to compute confidence interval

def compute_confidence_interval(score_arr, ci=0.95):
    """Return the mean of the scores and the half-width of its t-based CI.

    Parameters
    ----------
    score_arr : sequence of float
        Per-sample scores (e.g. one value per utterance).
    ci : float, default 0.95
        Confidence level of the interval, strictly between 0 and 1.

    Returns
    -------
    (mean_score, half_width)
        The sample mean and the distance from the mean to the lower bound of
        the Student-t confidence interval, so the result can be reported as
        ``mean_score +/- half_width``.
    """
    mean_score = np.mean(score_arr)

    # The confidence level is passed positionally: the keyword for this
    # argument was renamed from `alpha` to `confidence` across SciPy releases,
    # and the positional form works with both.
    lower, _upper = st.t.interval(
        ci,
        df=len(score_arr) - 1,
        loc=mean_score,
        scale=st.sem(score_arr),
    )

    # The t interval is symmetric around the mean, so one side is enough.
    return mean_score, mean_score - lower

### mel-cepstral distance 

https://github.com/jasminsternkopf/mel_cepstral_distance

In [None]:
#!pip install mel-cepstral-distance --user

In [None]:
from mel_cepstral_distance import get_metrics_wavs, get_metrics_mels, get_metrics_mels_pairwise

In [None]:
# Mel-cepstral distance between each reference recording and its synthesized
# counterpart.
# NOTE(review): `path_to_wavs` is not defined in the visible part of this
# notebook; it is iterated as (reference_wav_path, tts_wav_path) pairs and
# must be defined before this cell runs.
mcd_arr = []

for wav_file_ref, wav_file_tts in path_to_wavs:
    # Only the first returned value (the MCD) is used; the other two are ignored.
    mcd_audio, _, _ = get_metrics_wavs(wav_file_ref, wav_file_tts)
    mcd_arr.append(mcd_audio)

mean_mcd, ci_mcd = compute_confidence_interval(mcd_arr)
# report the values as mean +/- CI half-width
print(f"model mcd score: {mean_mcd} +/- {ci_mcd}")

### wv-mos

In [None]:
# pip install git+https://github.com/AndreevP/wvmos

In [None]:
# Load the WV-MOS predictor used below to score synthesized audio.
# You might need a GPU to load the model.
# NOTE(review): cuda=True is hard-coded, so this presumably fails on a
# CPU-only machine -- confirm against the wvmos loader before relying on it.
from wvmos import get_wvmos

wvmos_model = get_wvmos(cuda=True)

In [None]:
# Predicted MOS for every synthesized utterance; the reference recording in
# each (reference, tts) pair is not needed for this metric.
mos_array = []

for _, wav_file_tts in path_to_wavs:
    mos_audio = wvmos_model.calculate_one(wav_file_tts)  # infer MOS score for one audio
    mos_array.append(mos_audio)

mean_mos, ci_mos = compute_confidence_interval(mos_array)
# report the values as mean +/- CI half-width
print(f"model wv-mos score: {mean_mos} +/- {ci_mos}")

### cosine similarity

In [None]:
# pip install resemblyzer

In [None]:
# Speaker-embedding setup for the cosine-similarity metric.
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np

# Encoder built with its default arguments; it is used below to embed both the
# reference and the synthesized utterances.
encoder = VoiceEncoder()

In [None]:
# Speaker similarity: cosine similarity between the speaker embeddings of each
# reference recording and its synthesized counterpart.
cos_sim_arr = []

for wav_file_ref, wav_file_tts in path_to_wavs:
    ref_wav = preprocess_wav(Path(wav_file_ref))
    gen_wav = preprocess_wav(Path(wav_file_tts))

    ref_emb = encoder.embed_utterance(ref_wav)
    gen_emb = encoder.embed_utterance(gen_wav)

    # the embeddings are already l2 normalized by the speaker model,
    # so the dot product is the cosine similarity
    cos_sim = ref_emb @ gen_emb

    cos_sim_arr.append(cos_sim)

mean_cos_sim, ci_cos_sim = compute_confidence_interval(cos_sim_arr)
# report the values as mean +/- CI half-width
print(f"model cosine similarity score: {mean_cos_sim} +/- {ci_cos_sim}")

### WER: requires a Whisper model fine-tuned on African-accented speech

In [None]:
#!pip install transformers
#!pip install jiwer

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Run on the first CUDA device when one is available, otherwise on the CPU;
# half precision is only selected together with CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# NOTE(review): this looks like a placeholder, not a real checkpoint id --
# replace it with the fine-tuned model before running.
model_id = "our_finetuned_whisper_model" #open_ai/whisper-medium-general

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor that the
# pipeline below needs.
processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline: audio is processed in 30 s chunks, 16 per batch, with
# timestamps returned.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


# create a custom dataset
# compute WER

# NOTE(review): `dataset` is not defined in the visible part of this notebook
# because the load below is commented out; the next statement raises NameError
# unless `dataset` is created elsewhere first.
# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

# Smoke test: transcribe one sample and show the recognized text.
result = pipe(sample)
print(result["text"])



In [None]:
# Import the module itself: the loop below calls `jiwer.process_words`, which
# `from jiwer import wer` alone does not bring into scope.
import jiwer

# text_prediction_arrray contains [(reference_textA, predicted_textA), (reference_textB, predicted_textB)]
# NOTE(review): it is not defined in the visible part of this notebook and
# must be built from the ASR transcripts before this cell runs.

# Corpus-level WER: sum the word errors and the reference lengths over all
# utterances and divide once, instead of averaging per-utterance WERs.
wer_numerator, wer_denominator = [], []
for reference_text, predicted_text in text_prediction_arrray:
    wo_obj = jiwer.process_words(reference_text, predicted_text)

    total_error = wo_obj.substitutions + wo_obj.insertions + wo_obj.deletions
    wer_numerator.append(total_error)
    wer_denominator.append(len(reference_text.split()))

wer = sum(wer_numerator) / sum(wer_denominator)


### Check whether one TTS model is statistically better than another

In [None]:

# e.g., to verify that model 1 is better than model 2 in WV-MOS scores.
# This is a paired comparison: both arrays are assumed to hold one score per
# utterance, for the same utterances in the same order -- TODO confirm.

# Convert to arrays first: plain Python lists (as built by the metric loops in
# this notebook) do not support element-wise subtraction.
scores_model1 = np.asarray(mos_score_array_of_model1, dtype=float)
scores_model2 = np.asarray(mos_score_array_of_model2, dtype=float)
if scores_model1.shape != scores_model2.shape:
    # Without this check NumPy could silently broadcast mismatched inputs.
    raise ValueError(
        "score arrays must have the same shape for a paired comparison: "
        f"{scores_model1.shape} vs {scores_model2.shape}"
    )

diff_in_scores = scores_model1 - scores_model2

mean_score, ci = compute_confidence_interval(diff_in_scores)

# If the confidence interval lies fully on the positive side of the real axis,
# the difference is statistically significant.
# E.g., for WV-MOS, the confidence interval will be 0.14 +/- 0.xx. If 0.xx is
# smaller than 0.14, then the difference is statistically significant.

if mean_score - ci > 0:
    print("model A is better than model B")