## TTS model evaluation (VITS)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# !pip install 'transformers[torch]'
# !pip install --upgrade huggingface_hub

In [None]:
# Import required modules 

import os
from tqdm import tqdm

import librosa
import numpy as np
import pandas as pd
import scipy
from datasets import Dataset
from IPython.display import Audio

import torch
from transformers import (
    AutoTokenizer,
    VitsModel,
    set_seed,
)

### Dataset & Model Setup

In [None]:
# load csv files

# Folder (relative to this notebook) that holds the evaluation CSVs and audio.
DATA_ROOT = "../data"

# Read the "seen" and "unseen" test-split CSVs, in that order.
test_seen, test_unseen = map(
    pd.read_csv,
    (
        f"{DATA_ROOT}/afritts-test-seen-clean.csv",
        f"{DATA_ROOT}/afritts-test-unseen-clean.csv",
    ),
)

In [None]:
# Create a custom HF dataset

# Replace missing age groups with the literal 'null' and make every value a string.
# The missing-value check must happen BEFORE the string cast: `astype(str)` turns NaN
# into the string 'nan', after which `fillna` has nothing left to fill.
# NOTE(review): only the seen split is cleaned here; confirm whether the unseen split
# needs the same treatment.
test_seen['age_group'] = test_seen['age_group'].map(
    lambda value: 'null' if pd.isna(value) else str(value)
)

# Convert each DataFrame to a {column name: list of values} dictionary
test_seen_dict = test_seen.to_dict(orient='list')
test_unseen_dict = test_unseen.to_dict(orient='list')

# Create a Hugging Face Dataset per split
seen_dataset = Dataset.from_dict(test_seen_dict)
unseen_dataset = Dataset.from_dict(test_unseen_dict)

In [None]:
# Keep only rows whose gender is 'Male' or 'Female', then take the first 16 of them.
_seen_with_gender = seen_dataset.filter(
    lambda example: example['gender'] in ('Male', 'Female')
)
seen_dataset_selected = _seen_with_gender.select(range(16))

In [None]:
# Load TTS models and tokenizers

def initialize_vits_model(model_id, device, torch_dtype='float32'):
    """Load a VITS checkpoint and its tokenizer, and place the model on `device`.

    Args:
        model_id: Checkpoint identifier passed to both `from_pretrained` calls.
        device: Target device handed to the model's `.to(...)`.
        torch_dtype: Dtype forwarded to `VitsModel.from_pretrained`.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    # The tokenizer is loaded first, then the weights (safetensors only).
    text_tokenizer = AutoTokenizer.from_pretrained(model_id)
    vits = VitsModel.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        use_safetensors=True,
    )
    vits.to(device)
    return vits, text_tokenizer

# Prefer the first GPU with half precision; otherwise fall back to CPU with full precision.
_cuda_ready = torch.cuda.is_available()
device = "cuda:0" if _cuda_ready else "cpu"
torch_dtype = torch.float16 if _cuda_ready else torch.float32


# Initialize the VITS systems under comparison.
# Every entry currently points at the same placeholder checkpoint: replace each id
# with your own fine-tuned model id.
vits_vctk_model, vits_vctk_tokenizer = initialize_vits_model(
    "facebook/mms-tts-eng", device, torch_dtype
)  # your_finetune_model_id
vits_afrotts_model, vits_afrotts_tokenizer = initialize_vits_model(
    "facebook/mms-tts-eng", device, torch_dtype
)  # your_finetune_model_id
vits_afrotts_ft_model, vits_afrotts_ft_tokenizer = initialize_vits_model(
    "facebook/mms-tts-eng", device, torch_dtype
)  # your_finetune_model_id
vits_afrotts_ft_ext_spk_model, vits_afrotts_ft_ext_spk_tokenizer = initialize_vits_model(
    "facebook/mms-tts-eng", device, torch_dtype
)  # your_finetune_model_id



In [None]:
# Run Inference and Save Speech Utterances

def synthesize_save(model, tokenizer, transcripts, audio_paths, output_dir):
    """Synthesize one WAV per transcript and return the paths of the WAV files.

    Each output file is named after the basename of the matching ground-truth path
    and written into `output_dir`. Files that already exist are reused, not
    regenerated, so the function doubles as a cache.

    Args:
        model: VITS model; called as `model(**inputs)` and expected to expose
            `.waveform` on its output and `config.sampling_rate`.
        tokenizer: Callable invoked as `tokenizer(text, return_tensors="pt")`.
        transcripts: Iterable of texts to synthesize.
        audio_paths: Iterable of ground-truth audio paths, paired with `transcripts`.
        output_dir: Directory for the synthesized files (created if missing).

    Returns:
        List of output file paths, one per (transcript, audio path) pair.
    """
    set_seed(555)  # make deterministic
    os.makedirs(output_dir, exist_ok=True)

    # The tokenizer returns CPU tensors; they must follow the model to its device,
    # otherwise inference fails as soon as the model lives on a GPU.
    model_device = getattr(model, "device", None)

    synthesized_speech = []
    for text, path in zip(transcripts, audio_paths):
        # Reuse the ground-truth file name for the synthesized file.
        file_path = os.path.join(output_dir, os.path.basename(path))
        if os.path.isfile(file_path):
            print(f"File {file_path} already exists. Loading...")
        else:
            inputs = tokenizer(text, return_tensors="pt")
            if model_device is not None:
                inputs = {
                    key: value.to(model_device) if hasattr(value, "to") else value
                    for key, value in inputs.items()
                }
            with torch.no_grad():
                outputs = model(**inputs)
            # Bring the waveform back to CPU and to float32: `.numpy()` is not
            # available on CUDA tensors and float16 cannot be written as WAV.
            audio = outputs.waveform[0].detach().cpu().float().numpy()
            scipy.io.wavfile.write(file_path, rate=model.config.sampling_rate, data=audio)
        synthesized_speech.append(file_path)
    return synthesized_speech

# Root folder for all synthesized speech; one sub-folder per split and per system.
output_dir = f"{DATA_ROOT}/AfriSpeech-TTS-D/tts_generated_speech/"


# Run for seen dataset
transcript_seen = [example['transcript'] for example in seen_dataset]
# 'audio_paths' values are appended directly to DATA_ROOT (no separator is inserted here).
audio_path_seen = [f"{DATA_ROOT}" + example['audio_paths'] for example in seen_dataset]
vits_vctk_audio_path_seen = synthesize_save(vits_vctk_model, vits_vctk_tokenizer, transcript_seen, audio_path_seen, f"{output_dir}/afritts_test_seen/vits_vctk")
vits_afrotts_audio_path_seen = synthesize_save(vits_afrotts_model, vits_afrotts_tokenizer, transcript_seen, audio_path_seen, f"{output_dir}/afritts_test_seen/vits_afrotts")
vits_afrotts_ft_audio_path_seen = synthesize_save(vits_afrotts_ft_model, vits_afrotts_ft_tokenizer, transcript_seen, audio_path_seen, f"{output_dir}/afritts_test_seen/vits_afrotts_ft")
vits_afrotts_ft_ext_spk_audio_path_seen = synthesize_save(vits_afrotts_ft_ext_spk_model, vits_afrotts_ft_ext_spk_tokenizer, transcript_seen, audio_path_seen, f"{output_dir}/afritts_test_seen/vits_afrotts_ft_ext_spk")

# Run for unseen dataset (only the FT EXT SPK system is synthesized on this split)
transcript_unseen = [example['transcript'] for example in unseen_dataset]
audio_path_unseen = [f"{DATA_ROOT}" + example['audio_paths'] for example in unseen_dataset]
# Folder name fixed from the misspelled "afritts_untest_seen" so it mirrors "afritts_test_seen".
vits_afrotts_ft_ext_spk_audio_path_unseen = synthesize_save(vits_afrotts_ft_ext_spk_model, vits_afrotts_ft_ext_spk_tokenizer, transcript_unseen, audio_path_unseen, f"{output_dir}/afritts_test_unseen/vits_afrotts_ft_ext_spk")

In [None]:
# explore what the speech utterances sound like 

import gradio as gr

def display_audio(audio_files, transcripts, idx):
    """Launch a small Gradio app that plays one clip, labelled with its transcript.

    Args:
        audio_files: Sequence of audio file paths.
        transcripts: Sequence of transcripts, indexed like `audio_files`.
        idx: Position of the clip (and transcript) to show.
    """
    with gr.Blocks() as demo:
        with gr.Column():
            clip = audio_files[idx]
            caption = transcripts[idx]
            gr.Audio(clip, label=caption)
    demo.launch(debug=False)

# Pick one random utterance index and play it for the ground truth and every system.
idx = np.random.randint(0, len(audio_path_seen))

for system_name, system_clips in (
    ("ground_truth", audio_path_seen),
    ("vits_vctk", vits_vctk_audio_path_seen),
    ("vits_afrotts", vits_afrotts_audio_path_seen),
    ("vits_afrotts_ft", vits_afrotts_ft_audio_path_seen),
    ("vits_afrotts_ft_ext_spk", vits_afrotts_ft_ext_spk_audio_path_seen),
):
    print(f"{system_name}: \n")
    display_audio(system_clips, transcript_seen, idx)

### Word Error Rate (Intelligibility)

In [None]:
# Load ASR model

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Same device / precision policy as for the TTS models: GPU + fp16 when available.
_gpu_available = torch.cuda.is_available()
device = "cuda:0" if _gpu_available else "cpu"
torch_dtype = torch.float16 if _gpu_available else torch.float32

asr_model_id = "Seyfelislem/afrispeech_large_A100"

asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    asr_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
)
asr_model.to(device)

# Tokenizer and feature extractor are taken from the Whisper large-v2 processor,
# not from `asr_model_id`.
processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")

_asr_pipe_options = dict(
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)
asr_pipe = pipeline("automatic-speech-recognition", **_asr_pipe_options)

In [None]:
# Transcribe speech utterances using ASR model

def transcribe(model, audio_files):
    """Transcribe each audio file with an ASR pipeline and return the texts.

    The audio is loaded at the sampling rate of the pipeline's feature extractor.
    A raw waveform passed to the pipeline is assumed to already be at that rate,
    whereas `librosa.load` resamples to 22,050 Hz by default, so the rate must be
    requested explicitly.

    Args:
        model: ASR pipeline; called with a 1-D waveform and expected to return a
            mapping with a "text" entry.
        audio_files: Iterable of audio file paths.

    Returns:
        List of transcriptions, in the order of `audio_files`.
    """
    # Fall back to 16 kHz when the pipeline exposes no feature extractor
    # (assumed Whisper-style input rate - TODO confirm for other ASR models).
    feature_extractor = getattr(model, "feature_extractor", None)
    target_sr = getattr(feature_extractor, "sampling_rate", None) or 16000

    transcriptions = []
    for audio_path in tqdm(audio_files, desc="Transcribing audio files"):
        audio, _ = librosa.load(audio_path, sr=target_sr)
        result = model(audio)
        transcriptions.append(result["text"])
    return transcriptions

# Seen split: transcribe the ground-truth recordings, then each system's output.
(
    gt_transcripts_seen,
    vits_vctk_transcripts_seen,
    vits_afrotts_transcripts_seen,
    vits_afrotts_ft_transcripts_seen,
    vits_afrotts_ft_ext_spk_transcripts_seen,
) = [
    transcribe(asr_pipe, clip_paths)
    for clip_paths in (
        audio_path_seen,
        vits_vctk_audio_path_seen,
        vits_afrotts_audio_path_seen,
        vits_afrotts_ft_audio_path_seen,
        vits_afrotts_ft_ext_spk_audio_path_seen,
    )
]

# Unseen split: ground truth and the FT EXT SPK system only.
gt_transcripts_unseen, vits_afrotts_ft_ext_spk_transcripts_unseen = [
    transcribe(asr_pipe, clip_paths)
    for clip_paths in (audio_path_unseen, vits_afrotts_ft_ext_spk_audio_path_unseen)
]

In [None]:
# Compute WER for the transcriptions

import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

def compute_normalized_wer(predictions, ground_truth):
    """Compute the word error rate after normalizing both sides of the comparison.

    Args:
        predictions: Iterable of hypothesis strings (ASR output).
        ground_truth: Iterable of reference strings.

    Returns:
        Whatever the `evaluate` "wer" metric returns for the normalized inputs.
    """
    wer_metric = evaluate.load("wer")

    # One normalizer instance is applied to hypotheses first, then to references.
    normalize = BasicTextNormalizer()
    return wer_metric.compute(
        predictions=list(map(normalize, predictions)),
        references=list(map(normalize, ground_truth)),
    )

# Seen split: score every transcription list against the seen reference transcripts.
(
    gt_wer_seen,
    vits_vctk_wer_seen,
    vits_afrotts_wer_seen,
    vits_afrotts_ft_wer_seen,
    vits_afrotts_ft_ext_spk_wer_seen,
) = [
    compute_normalized_wer(hypotheses, transcript_seen)
    for hypotheses in (
        gt_transcripts_seen,
        vits_vctk_transcripts_seen,
        vits_afrotts_transcripts_seen,
        vits_afrotts_ft_transcripts_seen,
        vits_afrotts_ft_ext_spk_transcripts_seen,
    )
]

# Unseen split: same scoring against the unseen reference transcripts.
gt_wer_unseen, vits_afrotts_ft_ext_spk_wer_unseen = [
    compute_normalized_wer(hypotheses, transcript_unseen)
    for hypotheses in (gt_transcripts_unseen, vits_afrotts_ft_ext_spk_transcripts_unseen)
]

In [None]:
# Compile the WER results into one table, one row per system.
# Only the ground-truth audio and the FT EXT SPK system were transcribed on the
# unseen split, so the remaining rows carry None in the 'WER Unseen' column.
results = {
    'Model': ['Ground Truth', 'VITS VCTK', 'VITS AfriTTS', 'VITS AfriTTS FT', 'VITS AfriTTS FT EXT SPK'],
    'WER Seen': [gt_wer_seen, vits_vctk_wer_seen, vits_afrotts_wer_seen, vits_afrotts_ft_wer_seen, vits_afrotts_ft_ext_spk_wer_seen],
    'WER Unseen': [gt_wer_unseen, None, None, None, vits_afrotts_ft_ext_spk_wer_unseen]
}
wer_results_df = pd.DataFrame(results)
display(wer_results_df)

### Mel-Cepstral-Distance (Speech Signal Similarity)
https://github.com/jasminsternkopf/mel_cepstral_distance

In [None]:
#!pip install mel-cepstral-distance --user

In [None]:
from pathlib import Path
from mel_cepstral_distance import get_metrics_wavs

def compute_mcd(ref_audio_path, synth_audio_path):
    """Compute the mel-cepstral distance for each (reference, synthesized) file pair.

    Args:
        ref_audio_path: Iterable of reference audio paths.
        synth_audio_path: Iterable of synthesized audio paths, paired by position.

    Returns:
        List with the first value returned by `get_metrics_wavs` for every pair.
        Pairing stops at the shorter of the two inputs.
    """
    def _pair_distance(reference_file, generated_file):
        # `get_metrics_wavs` yields three values; only the first is kept. DTW is disabled.
        distance, _second, _third = get_metrics_wavs(
            Path(reference_file), Path(generated_file), use_dtw=False
        )
        return distance

    return [
        _pair_distance(reference_file, generated_file)
        for reference_file, generated_file in zip(ref_audio_path, synth_audio_path)
    ]
    

# MCD on the seen split: each system's output against the same reference recordings.
(
    vits_afrotts_mcd_seen,
    vits_afrotts_ft_mcd_seen,
    vits_afrotts_ft_ext_spk_mcd_seen,
) = [
    compute_mcd(audio_path_seen, synthesized_paths)
    for synthesized_paths in (
        vits_afrotts_audio_path_seen,
        vits_afrotts_ft_audio_path_seen,
        vits_afrotts_ft_ext_spk_audio_path_seen,
    )
]

# MCD on the unseen split (FT EXT SPK system only).
vits_afrotts_ft_ext_spk_mcd_unseen = compute_mcd(
    audio_path_unseen,
    vits_afrotts_ft_ext_spk_audio_path_unseen,
)

In [None]:
import scipy.stats as st

# function to compute confidence interval

def compute_confidence_interval(score_arr, ci=0.95):
    """Return the mean of the scores and the half-width of its Student-t confidence interval.

    NaN entries are ignored. The result is meant to be reported as
    `mean +/- half_width`.

    Args:
        score_arr: Sequence of numeric scores (may contain NaN).
        ci: Confidence level in (0, 1); defaults to 0.95.

    Returns:
        `(mean_score, half_width)` as floats, where
        - both are NaN when no valid score is left,
        - `half_width` is NaN for a single score (the interval is undefined),
        - `half_width` is 0.0 when all scores are identical.
    """
    scores = np.asarray(score_arr, dtype=float)
    clean_arr = scores[~np.isnan(scores)]
    n_scores = clean_arr.size

    if n_scores == 0:
        return float("nan"), float("nan")

    mean_score = float(np.mean(clean_arr))
    if n_scores < 2:
        return mean_score, float("nan")

    std_err = st.sem(clean_arr)
    if not std_err > 0:
        # A zero scale cannot be passed to the t distribution (it yields NaN),
        # so the zero-width interval is returned explicitly.
        return mean_score, 0.0

    # The two-sided interval is symmetric around the mean, so its half-width is
    # the upper-tail t quantile times the standard error.
    half_width = st.t.ppf((1.0 + ci) / 2.0, df=n_scores - 1) * std_err
    return mean_score, float(half_width)


# Mean and confidence-interval value for every MCD score list (three seen, one unseen).
(
    (mean_ci_vits_afrotts_seen, ci_vits_afrotts_seen),
    (mean_ci_vits_afrotts_ft_seen, ci_vits_afrotts_ft_seen),
    (mean_ci_vits_afrotts_ft_ext_spk_seen, ci_vits_afrotts_ft_ext_spk_seen),
    (mean_ci_vits_afrotts_ft_ext_spk_unseen, ci_vits_afrotts_ft_ext_spk_unseen),
) = [
    compute_confidence_interval(mcd_scores)
    for mcd_scores in (
        vits_afrotts_mcd_seen,
        vits_afrotts_ft_mcd_seen,
        vits_afrotts_ft_ext_spk_mcd_seen,
        vits_afrotts_ft_ext_spk_mcd_unseen,
    )
]

In [None]:
# Summary table for MCD: one row per (system, split).
# 'Confidence Interval' holds the second value returned by
# compute_confidence_interval (mean minus the lower interval bound), so each row
# reads as "Mean MCD +/- Confidence Interval".
data = {
    'Model': [
        'VITS AfriTTS Seen',
        'VITS AfriTTS FT Seen',
        'VITS AfriTTS FT EXT SPK Seen',
        'VITS AfriTTS FT EXT SPK Unseen'
    ],
    'Mean MCD': [
        mean_ci_vits_afrotts_seen,
        mean_ci_vits_afrotts_ft_seen,
        mean_ci_vits_afrotts_ft_ext_spk_seen,
        mean_ci_vits_afrotts_ft_ext_spk_unseen
    ],
    'Confidence Interval': [
        ci_vits_afrotts_seen,
        ci_vits_afrotts_ft_seen,
        ci_vits_afrotts_ft_ext_spk_seen,
        ci_vits_afrotts_ft_ext_spk_unseen
    ]
}
mcd_ci_df = pd.DataFrame(data)
display(mcd_ci_df)

### Cosine Similarity (Speaker Similarity)

In [None]:
# pip install resemblyzer

In [None]:
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np

# Resemblyzer speaker encoder, created once and passed to every compute_cos_sim call below.
encoder = VoiceEncoder()

def compute_cos_sim(encoder, ref_audio_path, synth_audio_path):
    """Compute the speaker-embedding similarity for each (reference, synthesized) pair.

    Args:
        encoder: Speaker encoder exposing `embed_utterance(wav)`.
        ref_audio_path: Iterable of reference audio paths.
        synth_audio_path: Iterable of synthesized audio paths, paired by position.

    Returns:
        List with one dot product of the two utterance embeddings per pair.
        Pairing stops at the shorter of the two inputs.
    """
    similarities = []
    for reference_file, generated_file in zip(ref_audio_path, synth_audio_path):
        # Preprocess both recordings first, then embed them.
        reference_wav = preprocess_wav(Path(reference_file))
        generated_wav = preprocess_wav(Path(generated_file))
        reference_vec = encoder.embed_utterance(reference_wav)
        generated_vec = encoder.embed_utterance(generated_wav)

        # The plain dot product is used as cosine similarity: the embeddings are
        # expected to be L2-normalized by the speaker model already.
        similarities.append(reference_vec @ generated_vec)
    return similarities

# Speaker similarity on the seen split: each system's output vs. the reference recordings.
(
    vits_afrotts_cos_sim_seen,
    vits_afrotts_ft_cos_sim_seen,
    vits_afrotts_ft_ext_spk_cos_sim_seen,
) = [
    compute_cos_sim(encoder, audio_path_seen, synthesized_paths)
    for synthesized_paths in (
        vits_afrotts_audio_path_seen,
        vits_afrotts_ft_audio_path_seen,
        vits_afrotts_ft_ext_spk_audio_path_seen,
    )
]

# Speaker similarity on the unseen split (FT EXT SPK system only).
vits_afrotts_ft_ext_spk_cos_sim_unseen = compute_cos_sim(
    encoder,
    audio_path_unseen,
    vits_afrotts_ft_ext_spk_audio_path_unseen,
)

In [None]:
# Mean and confidence-interval value for every cosine-similarity score list.
(
    (mean_ci_afrotts, ci_afrotts),
    (mean_ci_afrotts_ft, ci_afrotts_ft),
    (mean_ci_afrotts_ft_ext_spk_seen, ci_afrotts_ft_ext_spk_seen),
    (mean_ci_afrotts_ft_ext_spk_unseen, ci_afrotts_ft_ext_spk_unseen),
) = [
    compute_confidence_interval(similarity_scores)
    for similarity_scores in (
        vits_afrotts_cos_sim_seen,
        vits_afrotts_ft_cos_sim_seen,
        vits_afrotts_ft_ext_spk_cos_sim_seen,
        vits_afrotts_ft_ext_spk_cos_sim_unseen,
    )
]


In [None]:
# Summary table for speaker similarity: one row per (system, split).
# 'Confidence Interval (CI)' holds the second value returned by
# compute_confidence_interval (mean minus the lower interval bound), so each row
# reads as "Mean Cosine Similarity +/- CI".
# Note: `data` is rebound here; it no longer refers to the MCD table's dictionary.
data = {
    'Model': [
        'VITS AfriTTS Seen',
        'VITS AfriTTS FT Seen',
        'VITS AfriTTS FT EXT SPK Seen',
        'VITS AfriTTS FT EXT SPK Unseen'
    ],
    'Mean Cosine Similarity': [
        mean_ci_afrotts,
        mean_ci_afrotts_ft,
        mean_ci_afrotts_ft_ext_spk_seen,
        mean_ci_afrotts_ft_ext_spk_unseen
    ],
    'Confidence Interval (CI)': [
        ci_afrotts,
        ci_afrotts_ft,
        ci_afrotts_ft_ext_spk_seen,
        ci_afrotts_ft_ext_spk_unseen
    ]
}
cos_sim_results_df = pd.DataFrame(data)
display(cos_sim_results_df)

### WV-MOS (Overall quality)

https://github.com/AndreevP/wvmos

In [None]:
# pip install git+https://github.com/AndreevP/wvmos

In [None]:
# =======================
# ATTENTION
#
# you need a gpu to load the model
# =======================
from wvmos import get_wvmos

# Load the WV-MOS predictor with CUDA enabled.
# NOTE(review): `cuda=True` is hard-coded, unlike the TTS/ASR models, which check
# torch.cuda.is_available(); this cell is expected to fail without a GPU.
wvmos_model = get_wvmos(cuda=True)

In [None]:
# mos_array = []

# for _, wav_file_tts in path_to_wavs:
#     mos_audio = wvmos_model.calculate_one(wav_file_tts) # infer MOS score for one audio
    
#     mos_array.append(mos_audio)
    
# mean_mos, ci_mos = compute_confidence_interval(mos_array)
# # report the values as mean +/- ci
# print(f"model WV-MOS score: {mean_mos} +/- {ci_mos}")

### Check whether one TTS model is statistically better than another

In [None]:

# # e.g., To verify that model 1 is better than model 2 in WV-mos scores

# diff_in_scores = mos_score_array_of_model1 - mos_score_array_of_model2


# mean_score, ci = compute_confidence_interval(diff_in_scores,)

# # If the confidence intervals lie fully on the positive side on the real axis, 
# # this means that the difference is statistically significant. 
# # E.g., for WV-MOS, the confidence interval will be 0.14 +/- 0.xx. If xx is smaller than 14, 
# # then the difference is statistically significant.

# if mean_score - ci > 0:
#     print("model A is better than model B")