## TTS model evaluation (VITS)

In [None]:
!pip install awscli
!pip install --upgrade huggingface_hub

!pip install 'transformers[torch]'
!pip install 'datasets[audio]'

!pip install gradio
!pip install --upgrade evaluate jiwer
# !pip install mel-cepstral-distance --user
!pip install resemblyzer
!pip install git+https://github.com/AndreevP/wvmos
!git clone https://github.com/gabrielmittag/NISQA.git

In [None]:
# Import required modules

import os
from tqdm import tqdm

import librosa
import numpy as np
import pandas as pd
import scipy
from datasets import Dataset
from IPython.display import Audio

import torch
from transformers import (
    AutoTokenizer,
    VitsModel,
    set_seed,
)

### Dataset & Model Setup

In [None]:
# Root directory for all data used by this notebook. The generated speech is
# expected under {DATA_DIR}/AfriSpeech-TTS-D/ and the NISQA results CSV is
# written directly into this directory.
DATA_DIR = "." # change to your data root dir

In [None]:
# # Download the audio data (if not already downloaded)

# !aws configure
# !aws s3 cp s3://intron-open-source/AfriSpeech-TTS-D {DATA_DIR}/AfriSpeech-TTS-D/ --recursive
# !aws s3 cp s3://intron-open-source/AfriSpeech-TTS/tts_generated_speech {DATA_DIR}/AfriSpeech-TTS-D/tts_generated_speech/ --recursive

In [None]:
# Load the "test seen" manifest (CSV) straight from the AfriSpeech-TTS GitHub
# repository. Later cells read its audio path column to locate the reference
# recordings used for the speaker-similarity metric.
test_seen = pd.read_csv("https://raw.githubusercontent.com/intron-innovation/AfriSpeech-TTS/vits/data/afritts-test-seen-clean.csv")

In [None]:
def load_data(prefix_path, file):
    """Read the pipe-separated manifest ``<prefix_path>/<file>.txt`` into a DataFrame.

    The manifest may or may not start with a header row. If the first row
    already contains all expected column names, the file is re-read with that
    row as the header; otherwise the expected names are assigned to the columns.

    Every ``audio_path`` entry is then rewritten to
    ``<prefix_path>/<name of the file's parent directory>/<file name>``, so the
    paths resolve under ``prefix_path`` regardless of where they pointed before.

    Args:
        prefix_path: Directory containing the manifest and the audio folders.
        file: Manifest name without the ``.txt`` extension.

    Returns:
        DataFrame with the columns ``audio_path``, ``text``, ``country``,
        ``accent``, ``speaker_id`` and ``sentence_id``.
    """
    column_names = ['audio_path', 'text', 'country', 'accent', 'speaker_id', 'sentence_id']
    manifest_path = os.path.join(prefix_path, file + '.txt')

    df = pd.read_csv(manifest_path, sep='|', header=None)
    first_row_values = set(df.iloc[0])

    if set(column_names).issubset(first_row_values):
        # The first row is a header: parse again so it is used as column names.
        df = pd.read_csv(manifest_path, sep='|')
    else:
        df.columns = column_names

    def _relocate(original_path):
        # Keep only the immediate parent folder and the file name.
        parent_name = os.path.basename(os.path.dirname(original_path))
        return os.path.join(prefix_path, parent_name, os.path.basename(original_path))

    df['audio_path'] = df['audio_path'].apply(_relocate)
    return df


def extract_audio_texts(df):
    """Return the ``(audio_path, text)`` pairs of ``df`` as a list of plain tuples.

    Args:
        df: DataFrame that has at least the columns ``audio_path`` and ``text``.

    Returns:
        One ``(audio_path, text)`` tuple per row, in row order.
    """
    selected = df[['audio_path', 'text']]
    return [pair for pair in selected.itertuples(index=False, name=None)]

# Directory holding one manifest (<name>.txt) per TTS system.
TTS_DIR = f"{DATA_DIR}/AfriSpeech-TTS-D/tts_generated_speech/afritts_test_seen"

# Names of the TTS systems to evaluate; each name is also the manifest's base name.
tts_generated_folders = [
    'vits_afrotts',
    'vits_afrotts_ft',
    'vits_afrotts_ft_ext_spk',
    'vits_vctk',
    'xtts',
    'xtts_ft'
]

# Maps TTS system name -> list of (audio_path, text) tuples.
audio_texts_dict = {}

for tts_model in tts_generated_folders:
    df = load_data(TTS_DIR, tts_model)
    audio_texts = extract_audio_texts(df)
    # Key by the system name. The previous code used an undefined variable
    # (`file`) here, which raised NameError on a fresh kernel.
    audio_texts_dict[tts_model] = audio_texts

In [None]:
# explore what the speech utterances sound like

import gradio as gr

def display_audio(audio_texts_dict, key, indices):
    """Launch a Gradio app with one audio player per selected utterance.

    Args:
        audio_texts_dict: Mapping from a name to a list of ``(audio, text)`` entries.
        key: Which entry of ``audio_texts_dict`` to show.
        indices: Positions of the utterances to show for that entry.
    """
    with gr.Blocks() as demo:
        with gr.Column():
            for position in indices:
                entry = audio_texts_dict[key][position]
                audio_path, transcript = entry[0], entry[1]
                # The transcript is used as the label of the player.
                gr.Audio(audio_path, label=transcript)
    demo.launch(debug=False)

# Select random examples. The sampling range is derived from the smallest
# loaded manifest so every sampled index is valid for every TTS system
# (a hard-coded population of 700 fails on smaller manifests).
NUM_EXAMPLES = 1
num_utterances = min(
    (len(audio_texts) for audio_texts in audio_texts_dict.values()),
    default=0,
)
indices = np.random.choice(
    num_utterances,
    min(NUM_EXAMPLES, num_utterances),  # never request more than is available
    replace=False,
)
print(indices)

for key in audio_texts_dict.keys():
    print(f"Displaying audio for {key}")
    display_audio(audio_texts_dict, key, indices)

### Word Error Rate (Intelligibility)

In [None]:
# Load ASR model

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Use the first GPU in half precision when CUDA is available, otherwise the
# CPU in single precision.
has_cuda = torch.cuda.is_available()
device = "cuda:0" if has_cuda else "cpu"
torch_dtype = torch.float16 if has_cuda else torch.float32

asr_model_id = "openai/whisper-large-v3"

asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    asr_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
)
asr_model.to(device)

# The processor supplies both the tokenizer and the feature extractor.
processor = AutoProcessor.from_pretrained(asr_model_id)

pipeline_options = dict(
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "english"},
)
asr_pipe = pipeline("automatic-speech-recognition", **pipeline_options)

In [None]:
from joblib import Parallel, delayed

def load_audio_file(audio_path, sampling_rate=16000):
    """Load an audio file with librosa at ``sampling_rate`` and return the samples.

    Args:
        audio_path: Path of the audio file to read.
        sampling_rate: Rate passed to ``librosa.load`` as ``sr``.

    Returns:
        The waveform returned by ``librosa.load``; the accompanying sample rate
        is discarded.
    """
    waveform = librosa.load(audio_path, sr=sampling_rate)[0]
    return waveform


# Transcribe speech utterances using ASR model

def transcribe(model, audio_texts_tuples, batch_size=16):
    """Transcribe every utterance in ``audio_texts_tuples`` with ``model``.

    All files are first loaded in parallel (one joblib worker per available
    core, using ``load_audio_file`` with its default sampling rate). The
    waveforms are then handed to ``model`` in slices of ``batch_size``.

    Args:
        model: Callable that takes a list of waveforms and returns, for each
            one, a mapping with a ``"text"`` entry.
        audio_texts_tuples: Sequence whose items hold the audio path at index 0.
        batch_size: Number of waveforms passed to ``model`` per call.

    Returns:
        List of transcribed strings in the same order as the input.
    """
    paths = [item[0] for item in audio_texts_tuples]

    run_parallel = Parallel(n_jobs=-1)
    waveforms = run_parallel(
        delayed(load_audio_file)(path) for path in tqdm(paths, desc="Loading audio files")
    )

    texts = []
    batch_starts = range(0, len(waveforms), batch_size)
    for start in tqdm(batch_starts, desc="Transcribing batches"):
        outputs = model(waveforms[start:start + batch_size])
        texts.extend(output["text"] for output in outputs)
    return texts

# Maps TTS system name -> list of ASR transcriptions for its utterances.
transcriptions_dict = {}

for key, system_utterances in audio_texts_dict.items():
    transcriptions_dict[key] = transcribe(asr_pipe, system_utterances)


In [None]:
# Compute WER for the transcriptions

import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

def compute_normalized_wer(predictions, ground_truth):
    """Compute the word error rate after normalising both sides of the comparison.

    Predictions and references are passed through ``BasicTextNormalizer``
    before being scored with the ``wer`` metric loaded via ``evaluate``.

    Args:
        predictions: Iterable of transcribed strings.
        ground_truth: Iterable of reference strings, aligned with ``predictions``.

    Returns:
        The value returned by the metric's ``compute`` call.
    """
    metric = evaluate.load("wer")
    normalize = BasicTextNormalizer()

    normalized_predictions = list(map(normalize, predictions))
    normalized_references = list(map(normalize, ground_truth))

    return metric.compute(
        references=normalized_references,
        predictions=normalized_predictions,
    )

# Maps TTS system name -> normalised WER of its transcriptions.
wer_dict = {}

for key, predictions_list in transcriptions_dict.items():
    # The reference text is the second element of each (audio_path, text) pair.
    ground_truth_list = [reference for _, reference in audio_texts_dict[key]]
    wer_dict[key] = compute_normalized_wer(predictions_list, ground_truth_list)

# One row per TTS system.
wer_rows = list(wer_dict.items())
wer_df = pd.DataFrame(wer_rows, columns=['Model', 'WER'])
display(wer_df)

### Mel-Cepstral-Distance (Speech Signal Similarity)
https://github.com/jasminsternkopf/mel_cepstral_distance

In [None]:
# from pathlib import Path
# from mel_cepstral_distance import get_metrics_wavs

# def compute_mcd(ref_audio_path, synth_audio_path):
#     mcd_arr = []
#     for ref, synth in zip(ref_audio_path, synth_audio_path):
#         mcd_audio, _, _ = get_metrics_wavs(Path(ref), Path(synth), use_dtw=False)
#         mcd_arr.append(mcd_audio)
#     return mcd_arr

In [None]:
import scipy.stats as st

# function to compute confidence interval

def compute_confidence_interval(score_arr, ci=0.95):
    """Return the mean of the scores and the half-width of its Student-t confidence interval.

    NaN entries are ignored. The half-width is ``mean - lower_bound`` of the
    two-sided t-interval built from the standard error of the mean.

    Args:
        score_arr: Array-like of numeric scores (may contain NaN).
        ci: Confidence level of the interval, e.g. ``0.95``.

    Returns:
        Tuple ``(mean_score, half_width)``:
        * no valid score: ``(nan, nan)``
        * a single valid score: ``(that score, nan)``, since the interval is
          undefined with zero degrees of freedom
        * all valid scores identical: ``(mean, 0.0)``
    """
    scores = np.asarray(score_arr, dtype=float)
    clean_arr = scores[~np.isnan(scores)]
    n_scores = clean_arr.size

    if n_scores == 0:
        # Nothing to average; avoid the "mean of empty slice" warning.
        return np.nan, np.nan

    mean_score = clean_arr.mean()

    if n_scores < 2:
        return mean_score, np.nan

    if np.std(clean_arr) == 0:
        # A zero scale is rejected by scipy (the interval comes back as NaN),
        # so the degenerate zero-width interval is returned explicitly.
        return mean_score, 0.0

    # The confidence level is passed positionally because its keyword name
    # differs between SciPy versions (`alpha` before 1.9, `confidence` after).
    lower_bound, _ = st.t.interval(
        ci,
        df=n_scores - 1,
        loc=mean_score,
        scale=st.sem(clean_arr),
    )

    return mean_score, mean_score - lower_bound

### Cosine Distance (Speaker Similarity)
https://github.com/resemble-ai/Resemblyzer

In [None]:
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np


def compute_cos_sim(encoder, ref_audio_path, synth_audio_path):
    """Score the speaker similarity of each reference / synthesised audio pair.

    Both files of a pair are preprocessed with ``preprocess_wav`` and embedded
    with ``encoder.embed_utterance``; the score is the dot product of the two
    embeddings.

    Args:
        encoder: Object exposing ``embed_utterance(wav)``.
        ref_audio_path: Iterable of reference audio paths.
        synth_audio_path: Iterable of synthesised audio paths, aligned with
            ``ref_audio_path``. Pairing stops at the shorter of the two.

    Returns:
        List with one similarity score per pair.
    """
    def _pair_similarity(ref_file, synth_file):
        ref_wav = preprocess_wav(Path(ref_file))
        synth_wav = preprocess_wav(Path(synth_file))

        ref_embedding = encoder.embed_utterance(ref_wav)
        synth_embedding = encoder.embed_utterance(synth_wav)

        # Per the original author's note the embeddings are already
        # L2-normalised by the speaker model, so the dot product is the
        # cosine similarity.
        return ref_embedding @ synth_embedding

    return [
        _pair_similarity(ref_file, synth_file)
        for ref_file, synth_file in zip(ref_audio_path, synth_audio_path)
    ]

encoder = VoiceEncoder()

# Maps TTS system name -> list of per-utterance similarity scores.
cos_sim_dict = {}

for key, system_utterances in audio_texts_dict.items():
    # Synthesised audio comes from the system's manifest; reference audio
    # comes from the test_seen table, with DATA_DIR prepended to each path.
    synth_audio_path = [synth_file for synth_file, _ in system_utterances]
    ref_audio_path = [f"{DATA_DIR}{path}" for path in test_seen['audio_paths']]
    cos_sim_dict[key] = compute_cos_sim(encoder, ref_audio_path, synth_audio_path)

In [None]:
# Maps TTS system name -> summary statistics of its cosine similarities.
ci_results = {}

for model_name, similarity_scores in cos_sim_dict.items():
    mean_similarity, interval_value = compute_confidence_interval(similarity_scores)
    ci_results[model_name] = {
        'Mean Cosine Similarity': mean_similarity,
        # Second value returned by compute_confidence_interval.
        'Confidence Interval': interval_value,
    }

# One row per TTS system, indexed by system name.
ci_df = pd.DataFrame.from_dict(ci_results, orient='index')
display(ci_df)

### WV-MOS (Overall quality)

https://github.com/AndreevP/wvmos

In [None]:
# =======================
# you need a gpu to load the model
# =======================
from wvmos import get_wvmos

# Build the WV-MOS predictor with CUDA enabled (see the GPU note above).
wvmos_model = get_wvmos(cuda=True)

In [None]:
def compute_mos_scores(audio_paths, model):
    """Predict one MOS score per audio file.

    Args:
        audio_paths: Iterable of audio file paths.
        model: Object exposing ``calculate_one(path)``, called once per file.

    Returns:
        List of the values returned by ``model.calculate_one``, in input order.
    """
    return [model.calculate_one(wav_file_path) for wav_file_path in audio_paths]


# Maps TTS system name -> list of per-utterance WV-MOS scores.
wvmos_dict = {}

for key, system_utterances in audio_texts_dict.items():
    audio_paths = [wav_path for wav_path, _ in system_utterances]
    wvmos_dict[key] = compute_mos_scores(audio_paths, wvmos_model)

In [None]:
# Maps TTS system name -> summary statistics of its WV-MOS scores.
ci_results = {}

for model_name, mos_scores in wvmos_dict.items():
    mean_mos, interval_value = compute_confidence_interval(mos_scores)
    ci_results[model_name] = {
        'Mean WV-MOS': mean_mos,
        # Second value returned by compute_confidence_interval.
        'Confidence Interval': interval_value,
    }

# One row per TTS system, indexed by system name.
ci_df = pd.DataFrame.from_dict(ci_results, orient='index')
display(ci_df)

### NISQA (Speech Quality and Naturalness Assessment)
https://github.com/gabrielmittag/NISQA

In [None]:
import subprocess

# NISQA writes its predictions to this file inside the output directory; it is
# overwritten on every run, so it is read back right after each invocation.
output_csv = f"{DATA_DIR}/NISQA_results.csv"

dfs = []

for tts_model in tts_generated_folders:
    model_audio_dir = os.path.join(TTS_DIR, tts_model)
    # The command is passed as an argument list (no shell) so that paths
    # containing spaces or shell metacharacters reach NISQA unchanged.
    command = [
        'python', 'NISQA/run_predict.py',
        '--mode', 'predict_dir',
        '--pretrained_model', 'NISQA/weights/nisqa.tar',
        '--data_dir', model_audio_dir,
        '--num_workers', '0',
        '--bs', '10',
        '--output_dir', str(DATA_DIR),
    ]
    # check=True raises CalledProcessError if the prediction script fails.
    subprocess.run(command, check=True)
    df = pd.read_csv(output_csv)
    # Tag every row with the TTS system it belongs to.
    df['Model'] = os.path.basename(tts_model)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)


In [None]:
# NISQA prediction columns to summarise.
score_columns = ['mos_pred', 'noi_pred', 'dis_pred', 'col_pred', 'loud_pred']

# Collect one record per TTS system and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic.
ci_records = []

for model, group in combined_df.groupby('Model'):
    ci_results = {'Model': model}
    for col in score_columns:
        mean_score, ci = compute_confidence_interval(group[col])
        ci_results[f'{col}_mean'] = mean_score
        ci_results[f'{col}_ci'] = ci
    ci_records.append(ci_results)

ci_results_df = pd.DataFrame(ci_records)

display(ci_results_df)

### Check whether one TTS model is statistically better than another

In [None]:
# # verify that model 1 is better than model 2 in WV-mos scores

# def compare_models(mos_scores_model1, mos_scores_model2, ci_level=0.95):
#     # Compute difference in scores
#     diff_in_scores = np.array(mos_scores_model1) - np.array(mos_scores_model2)

#     # Compute the confidence interval of the difference
#     mean_score, ci_half_width = compute_confidence_interval(diff_in_scores, ci=ci_level)
#     lower_bound = mean_score - ci_half_width
#     upper_bound = mean_score + ci_half_width

#     # If the confidence intervals lie fully on the positive side on the real axis,
#     # this means that the difference is statistically significant.
#     # E.g., for WV-MOS, the confidence interval will be 0.14 +/- 0.xx. If xx is smaller than 14,
#     # then the difference is statistically significant.

#     # Check if the confidence interval lies fully on the positive side
#     if lower_bound > 0:
#         print("Model 1 is statistically significantly better than Model 2")
#     elif upper_bound < 0:
#         print("Model 2 is statistically significantly better than Model 1")
#     else:
#         print("No statistically significant difference between Model 1 and Model 2")