## TTS model evaluation (VITS)

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
# Import required modules 

import os
from tqdm import tqdm

import gradio as gr
import librosa
import numpy as np
import pandas as pd
import scipy
from datasets import Dataset
from IPython.display import Audio

import torch
import evaluate
from transformers import (
    AutoTokenizer,
    VitsModel,
    set_seed,
)

### Data & Model Setup

In [5]:
# load csv files

DATA_ROOT = "../data"
test_seen = pd.read_csv(f"{DATA_ROOT}/afritts-test-seen-clean.csv")
test_unseen = pd.read_csv(f"{DATA_ROOT}/afritts-test-unseen-clean.csv")

In [None]:
%load_ext autoreload
%autoreload 2

In [6]:
# Create a custom HF dataset

# Convert DataFrame to a dictionary with lists
test_seen['age_group'] = test_seen['age_group'].astype(str).fillna('null')
test_seen_dict = test_seen.to_dict(orient='list')
test_unseen_dict = test_unseen.to_dict(orient='list')

# Create a Hugging Face Dataset
seen_dataset = Dataset.from_dict(test_seen_dict)
unseen_dataset = Dataset.from_dict(test_unseen_dict)

In [8]:
# select a subset of the seen dataset
seen_dataset_selected = seen_dataset.filter(lambda example: example['gender'] in ['Male', 'Female']).select(range(10))

Filter:   0%|          | 0/646 [00:00<?, ? examples/s]

In [9]:
# Load TTS models and tokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Initialize the base model
base_model_id = "facebook/mms-tts-eng"
base_model = VitsModel.from_pretrained(base_model_id, torch_dtype=torch_dtype, use_safetensors=True)
base_model.to(device)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Initialize the finetuned model
# finetuned_model_id = "your_finetuned_model_id_here"
# finetuned_model = VitsModel.from_pretrained(finetuned_model_id)
# finetuned_model.to(device)

Some weights of the model checkpoint at facebook/mms-tts-eng were not used when initializing VitsModel: ['posterior_encoder.wavenet.res_skip_layers.8.weight_v', 'posterior_encoder.wavenet.in_layers.12.weight_g', 'posterior_encoder.wavenet.res_skip_layers.2.weight_v', 'flow.flows.2.wavenet.in_layers.1.weight_g', 'posterior_encoder.wavenet.res_skip_layers.1.weight_g', 'flow.flows.3.wavenet.res_skip_layers.0.weight_g', 'flow.flows.3.wavenet.res_skip_layers.1.weight_v', 'flow.flows.2.wavenet.res_skip_layers.3.weight_g', 'flow.flows.3.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.res_skip_layers.8.weight_g', 'posterior_encoder.wavenet.res_skip_layers.11.weight_v', 'posterior_encoder.wavenet.res_skip_layers.5.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'posterior_encoder.wavenet.res_skip_layers.13.weight_v', 'flow.flows.3.wavenet.in_layers.2.weight_v', 'posterior_encoder.wavenet.in_layers.9.weight_g', 'flow.flows.1.wavenet.in_layers.1.weight_g', 'flow.flows.1.wa

In [11]:
# Run Inference and Save Speech Utterances

def synthesize_save(model, tokenizer, texts, output_dir, prefix=""):
    set_seed(555)  # make deterministic
    os.makedirs(output_dir, exist_ok=True)
    synthesized_speech = []
    for i, text in tqdm(enumerate(texts), total=len(texts), desc=f"Synthesizing speech - {prefix}"):
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        audio = outputs.waveform[0].numpy()
        file_path = os.path.join(output_dir, f"{prefix}_utterance_{i}.wav")
        scipy.io.wavfile.write(file_path, rate=model.config.sampling_rate, data=audio)
        synthesized_speech.append(file_path)
    return synthesized_speech

output_dir = f"{DATA_ROOT}/AfriSpeech-TTS-D/tts_generated_speech/"

# Run for unseen dataset
gt_transcripts_unseen = [example['transcript'] for example in unseen_dataset]
gt_speech_unseen = [f"{DATA_ROOT}" + example['audio_paths'] for example in unseen_dataset]
base_speech_unseen = synthesize_save(base_model, tokenizer, gt_transcripts_unseen, output_dir, "base_unseen")
# ft_speech_unseen = synthesize_save(finetuned_model, tokenizer, gt_transcripts_unseen, output_dir, "ft_unseen")

# Run for seen dataset
gt_transcripts_seen = [example['transcript'] for example in seen_dataset_selected]
gt_speech_seen = [f"{DATA_ROOT}" + example['audio_paths'] for example in seen_dataset_selected]
base_speech_seen = synthesize_save(base_model, tokenizer, gt_transcripts_seen, output_dir, "base_seen")
# ft_speech_seen = synthesize_save(finetuned_model, tokenizer, gt_transcripts_seen, output_dir, "ft_seen")


Synthesizing speech - base_unseen: 100%|████████| 16/16 [00:24<00:00,  1.51s/it]
Synthesizing speech - base_seen: 100%|██████████| 10/10 [00:12<00:00,  1.26s/it]


In [39]:
def display_audio(audio_files, transcripts, n=1):
    with gr.Blocks() as demo:
      with gr.Column():
        for i in range(n):
            audio, label = audio_files[i], transcripts[i]
            output = gr.Audio(audio, label=label)    
    demo.launch(debug=False)
    
print("Ground Truth: \n")
display_audio(gt_speech_seen, gt_transcripts_seen)

print("VITS (eng) baseline: \n")
display_audio(base_speech_seen, gt_transcripts_seen)

# print("VITS (eng) FT: \n")
# display_audio(ft_speech_seen, gt_transcripts_seen)

Ground Truth: 

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


VITS (eng) baseline: 

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


### Word Error Rate (Intelligibility)

In [13]:
# Load ASR model (Whisper)

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

asr_model_id = "openai/whisper-medium" #open_ai/whisper-medium-general

asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    asr_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
asr_model.to(device)

processor = AutoProcessor.from_pretrained(asr_model_id)

asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# Transcribe speech utterances using ASR 

def transcribe(model, audio_files):
    transcriptions = []
    for audio_path in tqdm(audio_files, desc="Transcribing audio files"):
        audio, sampling_rate = librosa.load(audio_path)
        result = model(audio)
        transcriptions.append(result["text"])
    return transcriptions

# Run for unseen dataset
asr_transcripts_unseen = transcribe(asr_pipe, gt_speech_unseen)
base_transcripts_unseen = transcribe(asr_pipe, base_speech_unseen)
# ft_transcripts_unseen = transcribe(asr_model, ft_speech_unseen)


# Run for seen dataset
asr_transcripts_seen = transcribe(asr_pipe, gt_speech_seen)
base_transcripts_seen = transcribe(asr_pipe, base_speech_seen)
# ft_transcripts_seen = transcribe(asr_model, ft_speech_seen)


Transcribing audio files: 100%|█████████████████| 16/16 [01:31<00:00,  5.72s/it]
Transcribing audio files: 100%|█████████████████| 16/16 [01:25<00:00,  5.32s/it]
Transcribing audio files: 100%|█████████████████| 10/10 [00:51<00:00,  5.11s/it]
Transcribing audio files: 100%|█████████████████| 10/10 [00:51<00:00,  5.12s/it]


In [21]:
# Compute WER for the Transcriptions

from transformers.models.whisper.english_normalizer import BasicTextNormalizer

def compute_normalized_wer(predictions, ground_truth):
    wer_metric = evaluate.load("wer")
    
    # Normalize predictions and ground truth
    normalizer = BasicTextNormalizer()
    predictions_norm = [normalizer(pred) for pred in predictions]
    references_norm = [normalizer(label) for label in ground_truth]

    # Compute WER
    wer = wer_metric.compute(references=references_norm, predictions=predictions_norm)

    return wer


# Run for unseen dataset
gt_wer_unseen = compute_normalized_wer(asr_transcripts_unseen, gt_transcripts_unseen)
base_wer_unseen = compute_normalized_wer(base_transcripts_unseen, gt_transcripts_unseen)
# ft_wer_unseen = compute_normalized_wer(ft_transcripts_unseen, gt_transcripts_unseen)

# Run for seen dataset
gt_wer_seen = compute_normalized_wer(asr_transcripts_seen, gt_transcripts_seen)
base_wer_seen = compute_normalized_wer(base_transcripts_seen, gt_transcripts_seen)
# ft_wer_seen = compute_normalized_wer(ft_transcripts_seen, gt_transcripts_seen)

In [22]:
# ft_wer_unseen, ft_wer_seen = 1.0, 1.0

# Compile the results
results = {
    "Utterance": ["Ground Truth", "VITS (eng) baseline", "VITS (eng) FT", "Ground Truth", "VITS (eng) baseline", "VITS (eng) FT"],
    "Dataset": ["afritts-test-unseen-clean", "afritts-test-unseen-clean", "afritts-test-unseen-clean", "afritts-test-seen-clean", "afritts-test-seen-clean", "afritts-test-seen-clean"],
    "WER": [gt_wer_unseen, base_wer_unseen, ft_wer_unseen, gt_wer_seen, base_wer_seen, ft_wer_seen]
}

df_results = pd.DataFrame(results)

# Display the DataFrame
df_results.head(6)

Unnamed: 0,Utterance,Dataset,WER
0,Ground Truth,afritts-test-unseen-clean,0.432665
1,VITS (eng) baseline,afritts-test-unseen-clean,0.39255
2,VITS (eng) FT,afritts-test-unseen-clean,1.0
3,Ground Truth,afritts-test-seen-clean,0.483146
4,VITS (eng) baseline,afritts-test-seen-clean,0.58427
5,VITS (eng) FT,afritts-test-seen-clean,1.0


### WV-MOS (Overall quality)

https://github.com/AndreevP/wvmos

In [None]:
# pip install git+https://github.com/AndreevP/wvmos

In [24]:
# function to compute confidence interval

import scipy.stats as st

def compute_confidence_interval(score_arr, ci=0.95):
    # computes the CI at 95 perc confidence level
    
    mean_score = np.mean(score_arr)
    ci = st.t.interval(
            alpha=0.95,
            df=len(score_arr) - 1,
            loc=mean_score,
            scale=st.sem(score_arr),
        )
    
    return mean_score, mean_score-ci[0]

In [None]:
# you need a gpu to load the model
from wvmos import get_wvmos

wvmos_model = get_wvmos(cuda=True)

In [None]:
mos_array = []

for _, wav_file_tts in path_to_wavs:
    mos_audio = wvmos_model.calculate_one(wav_file_tts) # infer MOS score for one audio
    
    mos_array.append(mos_audio)
    
mean_mos, ci_mos = compute_confidence_interval(mos_array)
#report the values, mean +/- ci
print(f"model mcd score: {mean_mos} + {ci_mos}")

### Mel-Cepstral-Distance (Speech Signal Similarity)
https://github.com/jasminsternkopf/mel_cepstral_distance

In [None]:
from mel_cepstral_distance import get_metrics_wavs, get_metrics_mels, get_metrics_mels_pairwise

mcd_arr = []

for wav_file_ref, wav_file_tts in path_to_wavs:
    mcd_audio, _, _ = get_metrics_wavs(wav_file_ref, wav_file_tts,)
    
    mcd_arr.append(mcd_audio)
    

mean_mcd, ci_mcd = compute_confidence_interval(mcd_arr)
#report the values, mean +/- ci
print(f"model mcd score: {mean_mcd} + {ci_mcd}")

### Cosine Distance (Speaker Similarity)

In [None]:
# pip install resemblyzer

In [None]:
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np

encoder = VoiceEncoder()

In [None]:
cos_sim_arr = [] 

for wav_file_ref, wav_file_tts in path_to_wavs:
        ref_wav = preprocess_wav(Path(wav_file_ref))
        gen_wav = preprocess_wav(Path(wav_file_tts))
        
        ref_emb = encoder.embed_utterance(ref_wav)
        gen_emb = encoder.embed_utterance(gen_wav)
        
        # the embeddings are already l2 normalized by the speaker model
        cos_sim = ref_emb @ gen_emb
        
        cos_sim_arr.append(cos_sim)
        
mean_cos_sim, ci_cos_sim = compute_confidence_interval(cos_sim_arr)
#report the values, mean +/- ci
print(f"model mcd score: {mean_cos_sim} + {ci_cos_sim}")

### check if a TTSmodel is statistically better than another TTS model

In [None]:

# e.g., To verify that model 1 is better than model 2 in WV-mos scores

diff_in_scores = mos_score_array_of_model1 - mos_score_array_of_model2


mean_score, ci = compute_confidence_interval(diff_in_scores,)

# If the confidence intervals lie fully on the positive side on the real axis, 
# this means that the difference is statistically significant. 
# E.g., for WV-MOS, the confidence interval will be 0.14 +/- 0.xx. If xx is smaller than 14, 
# then the difference is statistically significant.

if mean_score - ci > 0:
    print("model A is better than model B")