## text2mos straight eval

<span style="font-size: 200%; color: red;">First! check the changing "your config.yaml" </span>

## [seq] import

In [1]:
import os
import json
import yaml
import sys
import time
import copy
import IPython.display as ipd
import pprint
from pathlib import Path
from tqdm import tqdm


import numpy as np
import torch
import torchaudio
from librosa.filters import mel as librosa_mel_fn
#import matplotlib
#matplotlib.use("Agg")
import matplotlib.pyplot as plt
from scipy.io.wavfile import write


import toybox

In [2]:
def plot_audio(audio, samplerate, title='time-domain waveform'):
    """
    usage:
        # audio is [channel, time(num_frames)] ex.torch.Size([1, 68608])
        # audio[0,:]: list of 1ch audio data
        # audio.shape[1]: int value of 1ch audio data length
        audio, sample_rate = torchaudio.load(str(iwav_path))
        %matplotlib inline
        plot_audio(audio, sample_rate)
    """
    # transform to mono
    channel = 0
    audio = audio[channel,:].view(1,-1)
    # to numpy
    audio = audio.to('cpu').detach().numpy().copy()
    time = np.linspace(0., audio.shape[1]/samplerate, audio.shape[1])
    
    fig, ax = plt.subplots(figsize=(12,9))
    
    ax.plot(time, audio[0, :])
    ax.set_title(title, fontsize=20, y=-0.12)
    ax.tick_params(direction='in')
    #ax.set_xlim(0, 3)
    ax.set_xlabel('Time')
    ax.set_ylabel('Amp')
    #ax.legend()
    plt.tight_layout()
    fig.canvas.draw()
    plt.show()
    #fig.savefig('figure.png')
    plt.close(fig)
    return fig

def plot_mel(tensors:list, titles:list[str]):
    """
    usage:
        mel = mel_process(...)
        fig_mel = plot_mel([mel_groundtruth[0], mel_prediction[0]],
                            ['groundtruth', 'inferenced(model)'])

    """
    xlim = max([t.shape[1] for t in tensors])
    fig, axs = plt.subplots(nrows=len(tensors),
                            ncols=1,
                            figsize=(12, 9),
                            constrained_layout=True)

    if len(tensors) == 1:
        axs = [axs]
    
    for i in range(len(tensors)):
        im = axs[i].imshow(tensors[i],
                           aspect="auto",
                           origin="lower",
                           interpolation='none')
        #plt.colorbar(im, ax=axs[i])
        fig.colorbar(im, ax=axs[i])
        axs[i].set_title(titles[i])
        axs[i].set_xlim([0, xlim])
    fig.canvas.draw()
    #plt.show()
    #plt.close()
    plt.close(fig)  # fig.close() 
    return fig

def convert_phn_to_id(phonemes, phn2id):
    """
    phonemes: phonemes separated by ' '
    phn2id: phn2id dict
    """
    return [phn2id[x] for x in ['<bos>'] + phonemes.split(' ') + ['<eos>']]


def text2phnid(text, phn2id, language='en', add_blank=True):
    if language == 'en':
        from text import G2pEn
        word2phn = G2pEn()
        phonemes = word2phn(text)
        if add_blank:
            phonemes = ' <blank> '.join(phonemes)
        return phonemes, convert_phn_to_id(phonemes, phn2id)
    else:
        raise ValueError(
            'Language should be en (for English)!')

In [3]:
# 
# ckpt_file_dir: logs4model/<model_name>/<runtime_name>/ckpt/

info_models_e500 = [
    {
        "model_name": "gradtts",
        "config_path": "configs/config_gt_k3.yaml",
        "runtime_name": "run_gt_k3",
        "ckpt_filename": "gradtts_500_397001.pt"
    },
    {
        "model_name": "gradseptts",
        "config_path": "configs/config_sgt_k3.yaml",
        "runtime_name": "run_sgt_k3",
        "ckpt_filename": "gradseptts_500_397001.pt"
    },
    {
        "model_name": "gradtfktts",
        "config_path": "configs/config_tfk_k3.yaml",
        "runtime_name": "run_tfk_k3",
        "ckpt_filename": "gradtfktts_500_397001.pt"
    },
    {
        "model_name": "gradtfk5tts",
        "config_path": "configs/config_tfk_k5.yaml",
        "runtime_name": "run_tfk_k5",
        "ckpt_filename": "gradtfk5tts_500_397001.pt"
    },
    {
        "model_name": "gradtimektts",
        "config_path": "configs/config_timek_k3.yaml",
        "runtime_name": "run_timek_k3",
        "ckpt_filename": "gradtimektts_500_397001.pt"
    },
    {
        "model_name": "gradfreqktts",
        "config_path": "configs/config_freqk_k3.yaml",
        "runtime_name": "run_freqk_k3",
        "ckpt_filename": "gradfreqktts_500_397001.pt"
    }
]

info_models_e1000 = [
    {
        "model_name": "gradtts",
        "config_path": "configs/config_gt_k3.yaml",
        "runtime_name": "run_gt_k3",
        "ckpt_filename": "gradtts_1000_794002.pt"
    },
    {
        "model_name": "gradseptts",
        "config_path": "configs/config_sgt_k3.yaml",
        "runtime_name": "run_sgt_k3",
        "ckpt_filename": "gradseptts_1000_794002.pt"
    },
    {
        "model_name": "gradtfktts",
        "config_path": "configs/config_tfk_k3.yaml",
        "runtime_name": "run_tfk_k3",
        "ckpt_filename": "gradtfktts_1000_794002.pt"
    },
    {
        "model_name": "gradtfk5tts",
        "config_path": "configs/config_tfk_k5.yaml",
        "runtime_name": "run_tfk_k5",
        "ckpt_filename": "gradtfk5tts_1000_794002.pt"
    },
    {
        "model_name": "gradtimektts",
        "config_path": "configs/config_timek_k3.yaml",
        "runtime_name": "run_timek_k3",
        "ckpt_filename": "gradtimektts_1000_794002.pt"
    },
    {
        "model_name": "gradfreqktts",
        "config_path": "configs/config_freqk_k3.yaml",
        "runtime_name": "run_freqk_k3",
        "ckpt_filename": "gradfreqktts_1000_794002.pt"
    }
]



target_epoch = 1000
print(f'target_epoch: {target_epoch}')
if target_epoch == 500:
    info_models = copy.deepcopy(info_models_e500)
elif target_epoch == 1000:
    info_models = copy.deepcopy(info_models_e1000)
else:
    print('Do not supported')

target_epoch: 1000


you can choose

- <span style="font-size: 200%; color: red;">inference section</span>
- <span style="font-size: 200%; color: red;">json analysis section</span>

if you choose <span style="color: red;">inference section</span>,
you keep go ahead below process.

but you choose <span style="color: red;">json analysis section</span>,
you need to <span style="color: blue;">jump to the end this page</span>.

## [seq] check configuration

In [4]:
# First, please check changing <model_name>
config_yaml = 'configs/config_exp_mid.yaml'
config = toybox.load_yaml_and_expand_var('configs/config_exp_mid.yaml')

In [5]:
# for model
model_name = config['model_name']
if model_name == 'gradtts':
    choise_idx = 0
elif model_name == 'gradseptts':
    choise_idx = 1
elif model_name == 'gradtfktts':
    choise_idx = 2
elif model_name == 'gradtfk5tts':
    choise_idx = 3
elif model_name == 'gradtimektts':
    choise_idx = 4
elif model_name == 'gradfreqktts':
    choise_idx = 5
else:
    os._exit(os.EX_OK)
    print('Alart: carefully chose model ')
print(f'model_name: {model_name}')

# for runtime to load model
runtime_name = config['runtime_name']
print(f'runtime_name: {runtime_name}')
config_path4model = info_models[choise_idx]["config_path"]
runtime_name4model = info_models[choise_idx]["runtime_name"]
ckpt_dir = f'logs4model/{model_name}/{runtime_name4model}/ckpt'
ckpt_path = ckpt_dir + "/" + info_models[choise_idx]["ckpt_filename"]
print(f"ckpt_path: {ckpt_path}")

model_name: gradfreqktts
runtime_name: infer4mid
ckpt_path: logs4model/gradfreqktts/run_freqk_k3/ckpt/gradfreqktts_1000_794002.pt


In [6]:
# for audio params
n_mels: int = config['n_mels'] # 80
n_fft: int = config['n_fft'] # 1024
sample_rate: int = config['sample_rate'] # 22050
hop_size: int = config['hop_size'] # 256
win_size: int = config['win_size'] # 1024
f_min: int = config['f_min'] # 0
f_max: int = config['f_max'] # 8000
random_seed: int = config['random_seed'] # 1234
print(n_mels, n_fft, sample_rate, hop_size, win_size, f_min, f_max, random_seed)

80 1024 22050 256 1024 0 8000 1234


In [7]:
print(f"phn2id_path: {config['phn2id_path']}")
with open(config['phn2id_path']) as f:
    phn2id = json.load(f)

vocab_size = len(phn2id) + 1

phn2id_path: ./configs/phn2id.json


In [8]:
# for hifigan
# setting file paths
# from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS/hifi-gan
# https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing
HiFiGAN_CONFIG = './hifigan/official_pretrained/LJ_V2/config.json'
HiFiGAN_ckpt = './hifigan/official_pretrained/LJ_V2/generator_v2'

from hifigan import models, env

with open(HiFiGAN_CONFIG) as f:
    hifigan_hparams = env.AttrDict(json.load(f))

hifigan_randomseed = hifigan_hparams.seed
print(f'hifigan_randomseed: {hifigan_randomseed}')

hifigan_randomseed: 1234


## [seq] device setting

In [9]:
import os

print(f"all cpu at using device: {os.cpu_count()}")
print(f"Number of available CPU: {len(os.sched_getaffinity(0))}") # Number of available CPUs can also be obtained. ,use systemcall at linux.
print(f"GPU_name: {torch.cuda.get_device_name()}\nGPU avail: {torch.cuda.is_available()}\n")

all cpu at using device: 52
Number of available CPU: 4


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [10]:
DEVICE = 'cpu' # 'cuda' or 'cpu'
flag_avail_cuda = torch.cuda.is_available()
if flag_avail_cuda == True:
    print('avail cuda')
    print('use cuda')
elif flag_avail_cuda == False and DEVICE=='cpu':
    print('Not avail cuda')
    print('use cpu')
else:
    os._exit(os.EX_OK)

device = torch.device(DEVICE)
print(f'device: {device}')

# setting random_seed ==============
print(f'device: {random_seed}')
toybox.set_seed(random_seed)

# reprint target epoch for ckpt
print(f'check for target_epoch: {target_epoch}')

Not avail cuda
use cpu
device: cpu
device: 1234
check for target_epoch: 1000


In [11]:
print(str(torch.get_default_device()))

cpu


## [seq] setting path

In [12]:
# for test_dataset
# RESULT_DIR_PATH = RESULT_DIR_PATH
# RESULT_MEL_DIR_PATH = IMEL_DIR_PATH
# RESULT_WAV_DIR_PATH = IWAV_DIR_PATH
# RESULT_JSON_DIR_PATH = RESULT_JSON_PATH
test_ds_path = Path(config['test_datalist_path'])

# text2mel
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel'
#print(RESULT_MEL_DIR_PATH)

# mel2wav
#IMEL_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/mel')
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / 'wav'

# for utmos
#IWAV_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/wav')
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_JSON_DIR_PATH = RESULT_DIR_PATH / RESULTS_JSON_NAME

if model_name == 'groundtruth':
    # for mel2wav
    #IMEL_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/mel/')
    RESULT_MEL_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/mel/')
    # for utmos
    #/result4eval/infer4PBL/groundtruth/cuda/
    #IWAV_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/wav/')
    RESULT_WAV_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/wav/')
    RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}')

# style for mid
#RESULTS_JSON_NAME = 'eval4mid.json'
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel'
#RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / 'wav'
#RESULT_JSON_PATH = RESULT_DIR_PATH / RESULTS_JSON_NAME

RESULTS_JSON_NAME = 'eval4mid.json'
RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/{"e"+str(target_epoch)}')
RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel'
RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / 'wav'
RESULT_JSON_PATH = RESULT_DIR_PATH / RESULTS_JSON_NAME
RESULTS_JSON_NAME = 'eval4mid.json'
print(f'RESULT_DIR_PATH: {RESULT_DIR_PATH}')

RESULT_DIR_PATH: result4eval/infer4mid/gradfreqktts/cpu/e1000


In [14]:
# check path

# for text2mel
print('test_ds_path-----------------------------------------')
if test_ds_path.exists():
    print(f'Exists {str(test_ds_path)}')
    with open(config['test_datalist_path']) as j:
        test_ds_list = json.load(j)
    print(f'loaded {test_ds_path}')
else:
    print(f'No exist {test_ds_path}')

print('RESULT_DIR_PATH-------------------------------------------')
if RESULT_DIR_PATH.exists():
    print(f'Exists {RESULT_DIR_PATH}')
else:
    RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_DIR_PATH}')

print('RESULT_MEL_DIR_PATH-------------------------------------------')
if RESULT_MEL_DIR_PATH.exists():
    print(f'Exists {RESULT_MEL_DIR_PATH}')
else:
    RESULT_MEL_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_MEL_DIR_PATH}')

print('RESULT_WAV_DIR_PATH-------------------------------------------')
if RESULT_WAV_DIR_PATH.exists():
    print(f'Exists {RESULT_WAV_DIR_PATH}')
else:
    RESULT_WAV_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_WAV_DIR_PATH}')

print('RESULT_JSON_PATH-------------------------------------------')
if RESULT_JSON_PATH.exists():
    print(f'Exists {RESULT_JSON_PATH}')
else:
    #RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_JSON_PATH}')

test_ds_path-----------------------------------------
Exists configs/test_dataset.json
loaded configs/test_dataset.json
RESULT_DIR_PATH-------------------------------------------
Exists result4eval/infer4mid/gradfreqktts/cpu/e1000
RESULT_MEL_DIR_PATH-------------------------------------------
Exists result4eval/infer4mid/gradfreqktts/cpu/e1000/mel
RESULT_WAV_DIR_PATH-------------------------------------------
Exists result4eval/infer4mid/gradfreqktts/cpu/e1000/wav
RESULT_JSON_PATH-------------------------------------------
No exist result4eval/infer4mid/gradfreqktts/cpu/e1000/eval4mid.json


## [seq] load Model

In [15]:
# import models
from gradtts import GradTTS
from gradseptts import GradSepTTS
from gradtfktts import GradTFKTTS
from gradtfk5tts import GradTFKTTS as GradTFK5TTS
from gradtimektts import GradTimeKTTS
from gradfreqktts import GradFreqKTTS
from gradtfkfultts import GradTFKFULTTS

print(model_name)
print("[seq] loading Model")

print("loading diffusion-TTS ===================================")
N_STEP = 50
TEMP = 1.5

print('loading ', ckpt_path)
_, _, state_dict = torch.load(ckpt_path,
                            map_location=device)

#with open(config_path4model) as f:
#    config = yaml.load(f, yaml.SafeLoader)
config4model = toybox.load_yaml_and_expand_var(config_path4model)

print("[seq] Initializing diffusion-TTS...")
if model_name == "gradtts":
    model = GradTTS.build_model(config4model, vocab_size)
elif model_name == "gradseptts":
    model = GradSepTTS.build_model(config4model, vocab_size)
elif model_name == "gradtfktts":
    model = GradTFKTTS.build_model(config4model, vocab_size)
elif model_name == "gradtfk5tts":
    model = GradTFK5TTS.build_model(config4model, vocab_size)
elif model_name == "gradtfkfultts":
    model = GradTFKFULTTS.build_model(config4model, vocab_size)
elif model_name == "gradtimektts":
    model = GradTimeKTTS.build_model(config4model, vocab_size)
elif model_name == "gradfreqktts":
    model = GradFreqKTTS.build_model(config4model, vocab_size)
else:
    raise ValueError(f"Error: '{model_name}' is not supported")

model = model.to(device)
model.load_state_dict(state_dict)
print(f'Number of encoder + duration predictor parameters: {model.encoder.nparams/1e6}m')
print(f'Number of decoder parameters: {model.decoder.nparams/1e6}m')
print(f'Total parameters: {model.nparams/1e6}m')

print("loading HiFi-GAN ===================================")
"""
#setting file paths
# from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS/hifi-gan
# https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing
HiFiGAN_CONFIG = './hifigan/official_pretrained/LJ_V2/config.json'
HiFiGAN_ckpt = './hifigan/official_pretrained/LJ_V2/generator_v2'

from hifigan import models, env

with open(HiFiGAN_CONFIG) as f:
    hifigan_hparams = env.AttrDict(json.load(f))

hifigan_randomseed = hifigan_hparams.seed
print(f'hifigan_randomseed: {hifigan_randomseed}')
"""
# generator ===================
print("[seq] loading HiFiGAN")
vocoder = models.Generator(hifigan_hparams)

vocoder.load_state_dict(torch.load(
    HiFiGAN_ckpt, map_location=device)['generator'])
vocoder = vocoder.eval().to(device)
vocoder.remove_weight_norm()

print("loading UTMOS ===================================")
predictor_utmos = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)



gradfreqktts
[seq] loading Model
loading  logs4model/gradfreqktts/run_freqk_k3/ckpt/gradfreqktts_1000_794002.pt
[seq] Initializing diffusion-TTS...
Number of encoder + duration predictor parameters: 3.549137m
Number of decoder parameters: 2.013391m
Total parameters: 5.562528m
[seq] loading HiFiGAN




Removing weight norm...


Using cache found in /work/sora-sa/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0


In [16]:
"""
ttext = test_ds_list[0]['text']
tphonemes, tphnid = text2phnid(ttext, phn2id, 'en')
print(tphonemes)
print(len(tphonemes))
print(len(tphnid))

tphnid_len = torch.tensor(len(tphnid), dtype=torch.long).unsqueeze(0).to(device)
tphnid = torch.tensor(tphnid).unsqueeze(0).to(device)
print(tphnid_len)
print(tphnid)
"""

"\nttext = test_ds_list[0]['text']\ntphonemes, tphnid = text2phnid(ttext, phn2id, 'en')\nprint(tphonemes)\nprint(len(tphonemes))\nprint(len(tphnid))\n\ntphnid_len = torch.tensor(len(tphnid), dtype=torch.long).unsqueeze(0).to(device)\ntphnid = torch.tensor(tphnid).unsqueeze(0).to(device)\nprint(tphnid_len)\nprint(tphnid)\n"

## [seq] infer

In [17]:
infer_data_num: int = 101 #len(test_ds_list) is 200
print(f'infer_data_num: {infer_data_num}')
print(f'RESULT_DIR_PATH: {RESULT_DIR_PATH}')
print(f'RESULT_MEL_DIR_PATH: {RESULT_MEL_DIR_PATH}')
print(f'RESULT_WAV_DIR_PATH: {RESULT_WAV_DIR_PATH}')
print(f'RESULT_JSON_PATH: {RESULT_JSON_PATH}')

infer_data_num: 101
RESULT_DIR_PATH: result4eval/infer4mid/gradfreqktts/cpu/e1000
RESULT_MEL_DIR_PATH: result4eval/infer4mid/gradfreqktts/cpu/e1000/mel
RESULT_WAV_DIR_PATH: result4eval/infer4mid/gradfreqktts/cpu/e1000/wav
RESULT_JSON_PATH: result4eval/infer4mid/gradfreqktts/cpu/e1000/eval4mid.json


In [18]:
#a_num = range(90,101)
#print(range(infer_data_num)[-1])
#print(a_num[-1])

In [19]:
eval_list = []


for i in tqdm(range(infer_data_num)):
    test_ds_filename = test_ds_list[i]['name']
    mel_npy_path = RESULT_MEL_DIR_PATH / f"{test_ds_filename}.npy"
    synth_wav_path = RESULT_WAV_DIR_PATH / f"{test_ds_filename}.wav"
    print(f'test_ds_index_{i}: {test_ds_filename}')
    # [seq]text2mel =========================================================
    # load txt
    print('[seq]text2mel')
    text = test_ds_list[i]['text']
    phonemes, phnid = text2phnid(text, phn2id, 'en')
    phonemes_len_int = len(phonemes)
    phnid_len_int = len(phnid)
    print(f'phonemes_len: {phonemes_len_int}')
    print(f'phnid_len: {phnid_len_int}')
    phnid_len = torch.tensor(len(phnid), dtype=torch.long).unsqueeze(0).to(device)
    phnid = torch.tensor(phnid).unsqueeze(0).to(device)

    # [seq] synth speech
    # process text to mel
    # mel is [n_mels, n_frame]
    start_time = time.perf_counter()
    _, mel_prediction, _ = model.forward(phnid,
                                        phnid_len,
                                        n_timesteps=N_STEP,
                                        temperature=TEMP,
                                        solver='original')
    end_time = time.perf_counter()

    dt = end_time - start_time
    dt4mel = dt * 22050 / ( mel_prediction.shape[-1] * 256)
    print(f'{model_name} dt: {dt}')
    print(f'{model_name} RTF: {dt4mel}')
    
    # for save mel
    mel4save = mel_prediction.unsqueeze(0) # [batch, channel(freq), n_frame(time)] ex.[1, 80, 619]
    # save
    #mel_npy_path =  RESULT_MEL_DIR_PATH / f"{test_ds_filename}.npy"
    #print(f'test_ds_index_{i}: {mel_npy_path}')
    np.save(mel_npy_path, mel4save.cpu().detach().numpy().copy())

    # [seq]mel2wav =========================================================
    print('[seq]mel2wav')
    x = np.load(mel_npy_path) # [1, n_mel, n_frame]
    x2audio = torch.FloatTensor(x).to(device)
    x2audio = x2audio.squeeze().unsqueeze(0)
    # x2audio is [1, n_mels, n_frames]
    assert x2audio.shape[0] == 1
    with torch.no_grad():
        # vocoder.forward(x).cpu() is torch.Size([1, 1, 167168])
        audio = (vocoder.forward(x2audio).cpu().squeeze().clamp(-1,1).numpy() * 32768).astype(np.int16)
    write(
        synth_wav_path,
        hifigan_hparams.sampling_rate,
        audio)

    # [seq]wav2utmos =========================================================
    print('[seq]wav2utmos')
    #iwav_path = RESULT_WAV_DIR_PATH / f"{filename}.wav"
    #wav, samplerate = torchaudio.load(iwav_path)
    wav, samplerate = torchaudio.load(synth_wav_path)
    score_utmos = predictor_utmos(wav, samplerate)
    score_utmos_float = score_utmos.item()
    print(f'utmos: {score_utmos_float}')
    #eval_dict = {'name': filename, 'path': str(iwav_path), 'utmos': score_float}
    #score_utmos_list.append(eval_dict)
    
    # path, テキスト文、phonimes, phonimes数, dt, RTF, utmos
    eval_dict = {
        'name': test_ds_filename,
        'phonemes_len': phonemes_len_int,
        'phnid_len': phnid_len_int,
        'dt': dt,
        'RTF4mel': dt4mel,
        'utmos': score_utmos_float
    }
    eval_list.append(eval_dict)
    

  0%|                                                                                                     | 0/101 [00:00<?, ?it/s]

test_ds_index_0: LJ045-0049
[seq]text2mel
phonemes_len: 1127
phnid_len: 215
gradfreqktts dt: 22.842133776284754
gradfreqktts RTF: 2.8848346417194306
[seq]mel2wav
[seq]wav2utmos


  1%|▉                                                                                          | 1/101 [00:48<1:21:15, 48.76s/it]

utmos: 3.8492660522460938
test_ds_index_1: LJ017-0027
[seq]text2mel
phonemes_len: 411
phnid_len: 79
gradfreqktts dt: 5.010633602272719
gradfreqktts RTF: 1.80577391033789
[seq]mel2wav
[seq]wav2utmos


  2%|█▊                                                                                           | 2/101 [00:56<40:20, 24.45s/it]

utmos: 4.283524513244629
test_ds_index_2: LJ023-0031
[seq]text2mel
phonemes_len: 223
phnid_len: 45
gradfreqktts dt: 3.5765119860880077
gradfreqktts RTF: 1.855753230733259
[seq]mel2wav


  3%|██▊                                                                                          | 3/101 [01:01<25:47, 15.79s/it]

[seq]wav2utmos
utmos: 3.367459297180176
test_ds_index_3: LJ032-0046
[seq]text2mel
phonemes_len: 650
phnid_len: 125
gradfreqktts dt: 7.788422550074756
gradfreqktts RTF: 1.5315952949231983
[seq]mel2wav
[seq]wav2utmos


  4%|███▋                                                                                         | 4/101 [01:13<23:08, 14.31s/it]

utmos: 3.553670644760132
test_ds_index_4: LJ030-0026
[seq]text2mel
phonemes_len: 1025
phnid_len: 195
gradfreqktts dt: 12.226165103726089
gradfreqktts RTF: 1.6251141766562998
[seq]mel2wav
[seq]wav2utmos


  5%|████▌                                                                                        | 5/101 [01:31<24:53, 15.56s/it]

utmos: 3.5788278579711914
test_ds_index_5: LJ020-0041
[seq]text2mel
phonemes_len: 1154
phnid_len: 219
gradfreqktts dt: 13.538582754321396
gradfreqktts RTF: 1.530336233456297
[seq]mel2wav
[seq]wav2utmos


  6%|█████▌                                                                                       | 6/101 [01:51<27:02, 17.08s/it]

utmos: 4.138723373413086
test_ds_index_6: LJ001-0070
[seq]text2mel
phonemes_len: 1137
phnid_len: 215
gradfreqktts dt: 11.898073677904904
gradfreqktts RTF: 1.4852384771161862
[seq]mel2wav
[seq]wav2utmos


  7%|██████▍                                                                                      | 7/101 [02:09<27:03, 17.27s/it]

utmos: 4.254480361938477
test_ds_index_7: LJ019-0334
[seq]text2mel
phonemes_len: 611
phnid_len: 117
gradfreqktts dt: 6.345174660906196
gradfreqktts RTF: 1.5055860588087726
[seq]mel2wav
[seq]wav2utmos


  8%|███████▎                                                                                     | 8/101 [02:18<22:59, 14.83s/it]

utmos: 3.996896982192993
test_ds_index_8: LJ022-0152
[seq]text2mel
phonemes_len: 387
phnid_len: 75
gradfreqktts dt: 5.020624415948987
gradfreqktts RTF: 1.5611570449525491
[seq]mel2wav
[seq]wav2utmos


  9%|████████▎                                                                                    | 9/101 [02:26<19:22, 12.63s/it]

utmos: 3.4685163497924805
test_ds_index_9: LJ050-0154
[seq]text2mel
phonemes_len: 896
phnid_len: 171
gradfreqktts dt: 10.18163893930614
gradfreqktts RTF: 1.5278278705608965
[seq]mel2wav
[seq]wav2utmos


 10%|█████████                                                                                   | 10/101 [02:41<20:07, 13.27s/it]

utmos: 3.658318042755127
test_ds_index_10: LJ016-0045
[seq]text2mel
phonemes_len: 1237
phnid_len: 235
gradfreqktts dt: 13.453371810726821
gradfreqktts RTF: 1.4875182948217187
[seq]mel2wav
[seq]wav2utmos


 11%|██████████                                                                                  | 11/101 [03:01<22:58, 15.32s/it]

utmos: 3.7614963054656982
test_ds_index_11: LJ036-0100
[seq]text2mel
phonemes_len: 802
phnid_len: 153
gradfreqktts dt: 9.975741758011281
gradfreqktts RTF: 1.5288944740057047
[seq]mel2wav
[seq]wav2utmos


 12%|██████████▉                                                                                 | 12/101 [03:16<22:33, 15.21s/it]

utmos: 3.7927186489105225
test_ds_index_12: LJ046-0016
[seq]text2mel
phonemes_len: 884
phnid_len: 167
gradfreqktts dt: 9.07360673788935
gradfreqktts RTF: 1.4886386054349714
[seq]mel2wav
[seq]wav2utmos


 13%|███████████▊                                                                                | 13/101 [03:30<21:43, 14.82s/it]

utmos: 4.318185806274414
test_ds_index_13: LJ048-0085
[seq]text2mel
phonemes_len: 1181
phnid_len: 223
gradfreqktts dt: 12.299596666824073
gradfreqktts RTF: 1.4921110613087083
[seq]mel2wav
[seq]wav2utmos


 14%|████████████▊                                                                               | 14/101 [03:48<23:13, 16.02s/it]

utmos: 4.168603897094727
test_ds_index_14: LJ050-0197
[seq]text2mel
phonemes_len: 282
phnid_len: 55
gradfreqktts dt: 3.6492992620915174
gradfreqktts RTF: 1.756002285464341
[seq]mel2wav
[seq]wav2utmos


 15%|█████████████▋                                                                              | 15/101 [03:54<18:31, 12.92s/it]

utmos: 4.066720962524414
test_ds_index_15: LJ050-0178
[seq]text2mel
phonemes_len: 709
phnid_len: 135
gradfreqktts dt: 7.979936700779945
gradfreqktts RTF: 1.555055184638343
[seq]mel2wav
[seq]wav2utmos


 16%|██████████████▌                                                                             | 16/101 [04:06<17:54, 12.64s/it]

utmos: 4.084699630737305
test_ds_index_16: LJ043-0079
[seq]text2mel
phonemes_len: 1445
phnid_len: 273
gradfreqktts dt: 20.113505525980145
gradfreqktts RTF: 1.7878563469421689
[seq]mel2wav
[seq]wav2utmos


 17%|███████████████▍                                                                            | 17/101 [04:35<24:24, 17.44s/it]

utmos: 4.227631092071533
test_ds_index_17: LJ050-0207
[seq]text2mel
phonemes_len: 737
phnid_len: 141
gradfreqktts dt: 8.318897793069482
gradfreqktts RTF: 1.561067677161476
[seq]mel2wav
[seq]wav2utmos


 18%|████████████████▍                                                                           | 18/101 [04:47<22:05, 15.97s/it]

utmos: 3.8534200191497803
test_ds_index_18: LJ034-0005
[seq]text2mel
phonemes_len: 1442
phnid_len: 273
gradfreqktts dt: 22.656556345988065
gradfreqktts RTF: 2.0938550639964326
[seq]mel2wav
[seq]wav2utmos


 19%|█████████████████▎                                                                          | 19/101 [05:18<27:50, 20.38s/it]

utmos: 3.801114320755005
test_ds_index_19: LJ031-0151
[seq]text2mel
phonemes_len: 1203
phnid_len: 227
gradfreqktts dt: 13.984741198830307
gradfreqktts RTF: 1.7110015504827785
[seq]mel2wav
[seq]wav2utmos


 20%|██████████████████▏                                                                         | 20/101 [05:38<27:30, 20.38s/it]

utmos: 4.110653877258301
test_ds_index_20: LJ023-0021
[seq]text2mel
phonemes_len: 564
phnid_len: 109
gradfreqktts dt: 6.217736491933465
gradfreqktts RTF: 1.5891724968371304
[seq]mel2wav
[seq]wav2utmos


 21%|███████████████████▏                                                                        | 21/101 [05:48<22:50, 17.13s/it]

utmos: 4.3260626792907715
test_ds_index_21: LJ015-0301
[seq]text2mel
phonemes_len: 780
phnid_len: 149
gradfreqktts dt: 9.3972298479639
gradfreqktts RTF: 1.5129155822693048
[seq]mel2wav
[seq]wav2utmos


 22%|████████████████████                                                                        | 22/101 [06:02<21:27, 16.30s/it]

utmos: 4.039696216583252
test_ds_index_22: LJ021-0153
[seq]text2mel
phonemes_len: 320
phnid_len: 63
gradfreqktts dt: 4.6432925700210035
gradfreqktts RTF: 1.7775104369611654
[seq]mel2wav


 23%|████████████████████▉                                                                       | 23/101 [06:10<17:43, 13.63s/it]

[seq]wav2utmos
utmos: 3.0826258659362793
test_ds_index_23: LJ014-0037
[seq]text2mel
phonemes_len: 365
phnid_len: 71
gradfreqktts dt: 5.056643248070031
gradfreqktts RTF: 1.6687467615532836
[seq]mel2wav


 24%|█████████████████████▊                                                                      | 24/101 [06:17<15:14, 11.88s/it]

[seq]wav2utmos
utmos: 3.8680949211120605
test_ds_index_24: LJ004-0200
[seq]text2mel
phonemes_len: 1097
phnid_len: 207
gradfreqktts dt: 14.699585995171219
gradfreqktts RTF: 1.6881555791329448
[seq]mel2wav
[seq]wav2utmos


 25%|██████████████████████▊                                                                     | 25/101 [06:39<18:33, 14.65s/it]

utmos: 3.784943103790283
test_ds_index_25: LJ049-0010
[seq]text2mel
phonemes_len: 471
phnid_len: 91
gradfreqktts dt: 5.317726229783148
gradfreqktts RTF: 1.5474010685008235
[seq]mel2wav
[seq]wav2utmos


 26%|███████████████████████▋                                                                    | 26/101 [06:47<16:02, 12.84s/it]

utmos: 3.745633125305176
test_ds_index_26: LJ008-0291
[seq]text2mel
phonemes_len: 365
phnid_len: 71
gradfreqktts dt: 4.488477427046746
gradfreqktts RTF: 1.62439153207689
[seq]mel2wav


 27%|████████████████████████▌                                                                   | 27/101 [06:54<13:39, 11.08s/it]

[seq]wav2utmos
utmos: 3.774350881576538
test_ds_index_27: LJ048-0221
[seq]text2mel
phonemes_len: 602
phnid_len: 115
gradfreqktts dt: 7.2188447611406446
gradfreqktts RTF: 1.5390579264305309
[seq]mel2wav
[seq]wav2utmos


 28%|█████████████████████████▌                                                                  | 28/101 [07:05<13:21, 10.98s/it]

utmos: 3.437954902648926
test_ds_index_28: LJ004-0157
[seq]text2mel
phonemes_len: 637
phnid_len: 121
gradfreqktts dt: 7.78364052483812
gradfreqktts RTF: 1.499836353228822
[seq]mel2wav
[seq]wav2utmos


 29%|██████████████████████████▍                                                                 | 29/101 [07:17<13:27, 11.22s/it]

utmos: 4.1889519691467285
test_ds_index_29: LJ013-0175
[seq]text2mel
phonemes_len: 681
phnid_len: 131
gradfreqktts dt: 7.640574256423861
gradfreqktts RTF: 1.4922996594577853
[seq]mel2wav
[seq]wav2utmos


 30%|███████████████████████████▎                                                                | 30/101 [07:29<13:30, 11.42s/it]

utmos: 3.7072415351867676
test_ds_index_30: LJ021-0100
[seq]text2mel
phonemes_len: 752
phnid_len: 143
gradfreqktts dt: 9.077215690165758
gradfreqktts RTF: 1.5512819782998122
[seq]mel2wav
[seq]wav2utmos


 31%|████████████████████████████▏                                                               | 31/101 [07:42<14:06, 12.10s/it]

utmos: 3.833601474761963
test_ds_index_31: LJ018-0132
[seq]text2mel
phonemes_len: 687
phnid_len: 131
gradfreqktts dt: 8.555574359372258
gradfreqktts RTF: 1.4887185497487239
[seq]mel2wav
[seq]wav2utmos


 32%|█████████████████████████████▏                                                              | 32/101 [07:55<14:13, 12.37s/it]

utmos: 4.087820053100586
test_ds_index_32: LJ023-0059
[seq]text2mel
phonemes_len: 1184
phnid_len: 225
gradfreqktts dt: 14.58811146626249
gradfreqktts RTF: 1.6318377527956975
[seq]mel2wav
[seq]wav2utmos


 33%|██████████████████████████████                                                              | 33/101 [08:17<17:05, 15.08s/it]

utmos: 3.064539909362793
test_ds_index_33: LJ003-0027
[seq]text2mel
phonemes_len: 964
phnid_len: 183
gradfreqktts dt: 11.615699242800474
gradfreqktts RTF: 1.650978291974464
[seq]mel2wav
[seq]wav2utmos


 34%|██████████████████████████████▉                                                             | 34/101 [08:34<17:28, 15.65s/it]

utmos: 3.7497384548187256
test_ds_index_34: LJ018-0133
[seq]text2mel
phonemes_len: 674
phnid_len: 129
gradfreqktts dt: 7.497025520075113
gradfreqktts RTF: 1.5338239748891798
[seq]mel2wav
[seq]wav2utmos


 35%|███████████████████████████████▉                                                            | 35/101 [08:45<15:50, 14.41s/it]

utmos: 4.331391334533691
test_ds_index_35: LJ033-0060
[seq]text2mel
phonemes_len: 596
phnid_len: 115
gradfreqktts dt: 7.337423372082412
gradfreqktts RTF: 1.522874485640222
[seq]mel2wav
[seq]wav2utmos


 36%|████████████████████████████████▊                                                           | 36/101 [08:56<14:33, 13.44s/it]

utmos: 3.6043527126312256
test_ds_index_36: LJ003-0299
[seq]text2mel
phonemes_len: 881
phnid_len: 167
gradfreqktts dt: 10.393578002694994
gradfreqktts RTF: 1.5303044535217991
[seq]mel2wav
[seq]wav2utmos


 37%|█████████████████████████████████▋                                                          | 37/101 [09:12<15:01, 14.08s/it]

utmos: 3.3767051696777344
test_ds_index_37: LJ011-0060
[seq]text2mel
phonemes_len: 1083
phnid_len: 205
gradfreqktts dt: 12.077137372922152
gradfreqktts RTF: 1.4903120471040778
[seq]mel2wav
[seq]wav2utmos


 38%|██████████████████████████████████▌                                                         | 38/101 [09:30<16:08, 15.37s/it]

utmos: 3.749478578567505
test_ds_index_38: LJ013-0240
[seq]text2mel
phonemes_len: 768
phnid_len: 147
gradfreqktts dt: 8.882225155830383
gradfreqktts RTF: 1.6038805742765654
[seq]mel2wav
[seq]wav2utmos


 39%|███████████████████████████████████▌                                                        | 39/101 [09:44<15:19, 14.82s/it]

utmos: 3.5965723991394043
test_ds_index_39: LJ047-0076
[seq]text2mel
phonemes_len: 1287
phnid_len: 245
gradfreqktts dt: 17.312665805220604
gradfreqktts RTF: 1.7140098823864687
[seq]mel2wav
[seq]wav2utmos


 40%|████████████████████████████████████▍                                                       | 40/101 [10:09<18:05, 17.80s/it]

utmos: 3.5964832305908203
test_ds_index_40: LJ041-0133
[seq]text2mel
phonemes_len: 1018
phnid_len: 193
gradfreqktts dt: 11.71084459591657
gradfreqktts RTF: 1.5145465192142944
[seq]mel2wav
[seq]wav2utmos


 41%|█████████████████████████████████████▎                                                      | 41/101 [10:26<17:43, 17.72s/it]

utmos: 3.6059486865997314
test_ds_index_41: LJ038-0264
[seq]text2mel
phonemes_len: 1116
phnid_len: 211
gradfreqktts dt: 12.455464106053114
gradfreqktts RTF: 1.5800061184788703
[seq]mel2wav
[seq]wav2utmos


 42%|██████████████████████████████████████▎                                                     | 42/101 [10:44<17:26, 17.74s/it]

utmos: 3.702253818511963
test_ds_index_42: LJ011-0016
[seq]text2mel
phonemes_len: 609
phnid_len: 117
gradfreqktts dt: 7.343712323810905
gradfreqktts RTF: 1.6867589243753174
[seq]mel2wav
[seq]wav2utmos


 43%|███████████████████████████████████████▏                                                    | 43/101 [10:56<15:23, 15.92s/it]

utmos: 3.905336380004883
test_ds_index_43: LJ003-0185
[seq]text2mel
phonemes_len: 1105
phnid_len: 209
gradfreqktts dt: 13.325840265955776
gradfreqktts RTF: 1.6805155212774803
[seq]mel2wav
[seq]wav2utmos


 44%|████████████████████████████████████████                                                    | 44/101 [11:15<16:02, 16.89s/it]

utmos: 3.665759325027466
test_ds_index_44: LJ014-0063
[seq]text2mel
phonemes_len: 793
phnid_len: 151
gradfreqktts dt: 8.391005114652216
gradfreqktts RTF: 1.5183631727455469
[seq]mel2wav
[seq]wav2utmos


 45%|████████████████████████████████████████▉                                                   | 45/101 [11:28<14:38, 15.69s/it]

utmos: 4.124660491943359
test_ds_index_45: LJ005-0185
[seq]text2mel
phonemes_len: 588
phnid_len: 113
gradfreqktts dt: 7.406216522213072
gradfreqktts RTF: 1.540865360005267
[seq]mel2wav
[seq]wav2utmos


 46%|█████████████████████████████████████████▉                                                  | 46/101 [11:39<13:13, 14.43s/it]

utmos: 4.3888020515441895
test_ds_index_46: LJ014-0135
[seq]text2mel
phonemes_len: 826
phnid_len: 157
gradfreqktts dt: 8.647651692852378
gradfreqktts RTF: 1.4956758269593604
[seq]mel2wav
[seq]wav2utmos


 47%|██████████████████████████████████████████▊                                                 | 47/101 [11:52<12:37, 14.02s/it]

utmos: 3.8154354095458984
test_ds_index_47: LJ009-0046
[seq]text2mel
phonemes_len: 1011
phnid_len: 193
gradfreqktts dt: 11.516059706918895
gradfreqktts RTF: 1.4716774650962168
[seq]mel2wav
[seq]wav2utmos


 48%|███████████████████████████████████████████▋                                                | 48/101 [12:10<13:18, 15.07s/it]

utmos: 3.8487741947174072
test_ds_index_48: LJ037-0024
[seq]text2mel
phonemes_len: 1505
phnid_len: 285
gradfreqktts dt: 22.363999971654266
gradfreqktts RTF: 2.0448770873763293
[seq]mel2wav
[seq]wav2utmos


 49%|████████████████████████████████████████████▋                                               | 49/101 [12:40<17:05, 19.72s/it]

utmos: 3.902602195739746
test_ds_index_49: LJ002-0217
[seq]text2mel
phonemes_len: 1093
phnid_len: 207
gradfreqktts dt: 12.893555168993771
gradfreqktts RTF: 1.6283844132395107
[seq]mel2wav
[seq]wav2utmos


 50%|█████████████████████████████████████████████▌                                              | 50/101 [12:55<15:26, 18.16s/it]

utmos: 3.9403457641601562
test_ds_index_50: LJ044-0017
[seq]text2mel
phonemes_len: 782
phnid_len: 149
gradfreqktts dt: 9.686596999876201
gradfreqktts RTF: 1.5479292080768077
[seq]mel2wav
[seq]wav2utmos


 50%|██████████████████████████████████████████████▍                                             | 51/101 [13:09<14:15, 17.11s/it]

utmos: 3.538675308227539
test_ds_index_51: LJ017-0074
[seq]text2mel
phonemes_len: 481
phnid_len: 93
gradfreqktts dt: 6.4323719148524106
gradfreqktts RTF: 1.5784566497784862
[seq]mel2wav
[seq]wav2utmos


 51%|███████████████████████████████████████████████▎                                            | 52/101 [13:19<12:14, 14.99s/it]

utmos: 4.008632183074951
test_ds_index_52: LJ033-0153
[seq]text2mel
phonemes_len: 714
phnid_len: 137
gradfreqktts dt: 8.055406176950783
gradfreqktts RTF: 1.5282704622260872
[seq]mel2wav
[seq]wav2utmos


 52%|████████████████████████████████████████████████▎                                           | 53/101 [13:32<11:21, 14.19s/it]

utmos: 4.010202884674072
test_ds_index_53: LJ032-0124
[seq]text2mel
phonemes_len: 1056
phnid_len: 199
gradfreqktts dt: 13.288325835950673
gradfreqktts RTF: 1.7211441769426241
[seq]mel2wav
[seq]wav2utmos


 53%|█████████████████████████████████████████████████▏                                          | 54/101 [13:51<12:16, 15.66s/it]

utmos: 3.869961738586426
test_ds_index_54: LJ018-0287
[seq]text2mel
phonemes_len: 567
phnid_len: 109
gradfreqktts dt: 6.494155132211745
gradfreqktts RTF: 1.5367028745843596
[seq]mel2wav
[seq]wav2utmos


 54%|██████████████████████████████████████████████████                                          | 55/101 [14:01<10:42, 13.97s/it]

utmos: 4.067721843719482
test_ds_index_55: LJ020-0038
[seq]text2mel
phonemes_len: 620
phnid_len: 119
gradfreqktts dt: 7.004491795785725
gradfreqktts RTF: 1.596075604508466
[seq]mel2wav
[seq]wav2utmos


 55%|███████████████████████████████████████████████████                                         | 56/101 [14:12<09:48, 13.07s/it]

utmos: 4.128397464752197
test_ds_index_56: LJ001-0007
[seq]text2mel
phonemes_len: 1016
phnid_len: 193
gradfreqktts dt: 13.346037171781063
gradfreqktts RTF: 1.5987923745967296
[seq]mel2wav
[seq]wav2utmos


 56%|███████████████████████████████████████████████████▉                                        | 57/101 [14:33<11:15, 15.35s/it]

utmos: 3.656695604324341
test_ds_index_57: LJ003-0313
[seq]text2mel
phonemes_len: 1175
phnid_len: 223
gradfreqktts dt: 16.08873875392601
gradfreqktts RTF: 1.7519194923557428
[seq]mel2wav
[seq]wav2utmos


 57%|████████████████████████████████████████████████████▊                                       | 58/101 [14:56<12:42, 17.74s/it]

utmos: 3.624356746673584
test_ds_index_58: LJ019-0265
[seq]text2mel
phonemes_len: 590
phnid_len: 113
gradfreqktts dt: 8.017229369841516
gradfreqktts RTF: 1.5448467876555982
[seq]mel2wav
[seq]wav2utmos


 58%|█████████████████████████████████████████████████████▋                                      | 59/101 [15:06<10:44, 15.35s/it]

utmos: 4.427779674530029
test_ds_index_59: LJ038-0281
[seq]text2mel
phonemes_len: 1193
phnid_len: 225
gradfreqktts dt: 14.080007351003587
gradfreqktts RTF: 1.6658662543442495
[seq]mel2wav
[seq]wav2utmos


 59%|██████████████████████████████████████████████████████▋                                     | 60/101 [15:26<11:31, 16.87s/it]

utmos: 4.236476421356201
test_ds_index_60: LJ045-0235
[seq]text2mel
phonemes_len: 1185
phnid_len: 223
gradfreqktts dt: 12.415118182078004
gradfreqktts RTF: 1.615330886015507
[seq]mel2wav
[seq]wav2utmos


 60%|███████████████████████████████████████████████████████▌                                    | 61/101 [15:44<11:32, 17.31s/it]

utmos: 4.176488399505615
test_ds_index_61: LJ038-0255
[seq]text2mel
phonemes_len: 240
phnid_len: 47
gradfreqktts dt: 3.6959364940412343
gradfreqktts RTF: 1.941106128372933
[seq]mel2wav


 61%|████████████████████████████████████████████████████████▍                                   | 62/101 [15:50<09:01, 13.88s/it]

[seq]wav2utmos
utmos: 3.7694027423858643
test_ds_index_62: LJ028-0205
[seq]text2mel
phonemes_len: 899
phnid_len: 171
gradfreqktts dt: 10.167644773609936
gradfreqktts RTF: 1.4970390441913668
[seq]mel2wav
[seq]wav2utmos


 62%|█████████████████████████████████████████████████████████▍                                  | 63/101 [16:02<08:20, 13.17s/it]

utmos: 3.4248993396759033
test_ds_index_63: LJ014-0260
[seq]text2mel
phonemes_len: 1068
phnid_len: 203
gradfreqktts dt: 14.958285619039088
gradfreqktts RTF: 1.7410800142515408
[seq]mel2wav
[seq]wav2utmos


 63%|██████████████████████████████████████████████████████████▎                                 | 64/101 [16:23<09:37, 15.62s/it]

utmos: 3.662825107574463
test_ds_index_64: LJ033-0166
[seq]text2mel
phonemes_len: 967
phnid_len: 183
gradfreqktts dt: 12.016021052841097
gradfreqktts RTF: 1.6146235387525971
[seq]mel2wav
[seq]wav2utmos


 64%|███████████████████████████████████████████████████████████▏                                | 65/101 [16:41<09:50, 16.40s/it]

utmos: 3.4509456157684326
test_ds_index_65: LJ037-0125
[seq]text2mel
phonemes_len: 1085
phnid_len: 205
gradfreqktts dt: 13.558540327940136
gradfreqktts RTF: 1.610807188745043
[seq]mel2wav
[seq]wav2utmos


 65%|████████████████████████████████████████████████████████████                                | 66/101 [17:01<10:09, 17.40s/it]

utmos: 3.95365834236145
test_ds_index_66: LJ013-0142
[seq]text2mel
phonemes_len: 740
phnid_len: 143
gradfreqktts dt: 8.324050412978977
gradfreqktts RTF: 1.5352759603033528
[seq]mel2wav
[seq]wav2utmos


 66%|█████████████████████████████████████████████████████████████                               | 67/101 [17:14<09:03, 15.99s/it]

utmos: 3.469390392303467
test_ds_index_67: LJ031-0199
[seq]text2mel
phonemes_len: 1439
phnid_len: 273
gradfreqktts dt: 19.48487584386021
gradfreqktts RTF: 1.8985148842137904
[seq]mel2wav
[seq]wav2utmos


 67%|█████████████████████████████████████████████████████████████▉                              | 68/101 [17:41<10:36, 19.30s/it]

utmos: 3.6768856048583984
test_ds_index_68: LJ004-0017
[seq]text2mel
phonemes_len: 619
phnid_len: 119
gradfreqktts dt: 7.618922980967909
gradfreqktts RTF: 1.5624744394563095
[seq]mel2wav
[seq]wav2utmos


 68%|██████████████████████████████████████████████████████████████▊                             | 69/101 [17:53<09:05, 17.05s/it]

utmos: 3.871365785598755
test_ds_index_69: LJ024-0115
[seq]text2mel
phonemes_len: 1011
phnid_len: 191
gradfreqktts dt: 11.310101766139269
gradfreqktts RTF: 1.4827562782021195
[seq]mel2wav
[seq]wav2utmos


 69%|███████████████████████████████████████████████████████████████▊                            | 70/101 [18:10<08:48, 17.05s/it]

utmos: 3.489604949951172
test_ds_index_70: LJ017-0171
[seq]text2mel
phonemes_len: 620
phnid_len: 119
gradfreqktts dt: 7.780451427213848
gradfreqktts RTF: 1.5918103656664317
[seq]mel2wav
[seq]wav2utmos


 70%|████████████████████████████████████████████████████████████████▋                           | 71/101 [18:19<07:18, 14.63s/it]

utmos: 4.143746376037598
test_ds_index_71: LJ017-0040
[seq]text2mel
phonemes_len: 412
phnid_len: 79
gradfreqktts dt: 5.493225432932377
gradfreqktts RTF: 1.5877414638758247
[seq]mel2wav
[seq]wav2utmos


 71%|█████████████████████████████████████████████████████████████████▌                          | 72/101 [18:27<06:13, 12.87s/it]

utmos: 3.4837825298309326
test_ds_index_72: LJ005-0044
[seq]text2mel
phonemes_len: 796
phnid_len: 151
gradfreqktts dt: 8.177439809776843
gradfreqktts RTF: 1.4922582412405603
[seq]mel2wav
[seq]wav2utmos


 72%|██████████████████████████████████████████████████████████████████▍                         | 73/101 [18:40<05:57, 12.78s/it]

utmos: 4.038662910461426
test_ds_index_73: LJ007-0169
[seq]text2mel
phonemes_len: 399
phnid_len: 77
gradfreqktts dt: 4.783804218750447
gradfreqktts RTF: 1.688698818894841
[seq]mel2wav
[seq]wav2utmos


 73%|███████████████████████████████████████████████████████████████████▍                        | 74/101 [18:47<05:00, 11.11s/it]

utmos: 3.7531344890594482
test_ds_index_74: LJ015-0153
[seq]text2mel
phonemes_len: 1156
phnid_len: 219
gradfreqktts dt: 12.516067792195827
gradfreqktts RTF: 1.4952068243862582
[seq]mel2wav
[seq]wav2utmos


 74%|████████████████████████████████████████████████████████████████████▎                       | 75/101 [19:06<05:51, 13.51s/it]

utmos: 3.9688849449157715
test_ds_index_75: LJ045-0043
[seq]text2mel
phonemes_len: 1166
phnid_len: 221
gradfreqktts dt: 15.983357415068895
gradfreqktts RTF: 1.7251773525721976
[seq]mel2wav
[seq]wav2utmos


 75%|█████████████████████████████████████████████████████████████████████▏                      | 76/101 [19:29<06:47, 16.29s/it]

utmos: 3.802044630050659
test_ds_index_76: LJ050-0010
[seq]text2mel
phonemes_len: 1046
phnid_len: 197
gradfreqktts dt: 10.911808555945754
gradfreqktts RTF: 1.518359871381537
[seq]mel2wav
[seq]wav2utmos


 76%|██████████████████████████████████████████████████████████████████████▏                     | 77/101 [19:45<06:28, 16.20s/it]

utmos: 4.213281631469727
test_ds_index_77: LJ006-0126
[seq]text2mel
phonemes_len: 1061
phnid_len: 201
gradfreqktts dt: 11.065158960875124
gradfreqktts RTF: 1.524917219295603
[seq]mel2wav
[seq]wav2utmos


 77%|███████████████████████████████████████████████████████████████████████                     | 78/101 [20:02<06:15, 16.32s/it]

utmos: 4.092154026031494
test_ds_index_78: LJ018-0356
[seq]text2mel
phonemes_len: 979
phnid_len: 185
gradfreqktts dt: 11.671179129742086
gradfreqktts RTF: 1.4959397077916492
[seq]mel2wav
[seq]wav2utmos


 78%|███████████████████████████████████████████████████████████████████████▉                    | 79/101 [20:19<06:07, 16.71s/it]

utmos: 3.786849021911621
test_ds_index_79: LJ040-0223
[seq]text2mel
phonemes_len: 1070
phnid_len: 201
gradfreqktts dt: 12.532923221588135
gradfreqktts RTF: 1.623302144243529
[seq]mel2wav
[seq]wav2utmos


 79%|████████████████████████████████████████████████████████████████████████▊                   | 80/101 [20:33<05:33, 15.89s/it]

utmos: 3.7475287914276123
test_ds_index_80: LJ008-0281
[seq]text2mel
phonemes_len: 651
phnid_len: 125
gradfreqktts dt: 7.946544865611941
gradfreqktts RTF: 1.5346597733914595
[seq]mel2wav
[seq]wav2utmos


 80%|█████████████████████████████████████████████████████████████████████████▊                  | 81/101 [20:45<04:55, 14.78s/it]

utmos: 4.4168291091918945
test_ds_index_81: LJ008-0222
[seq]text2mel
phonemes_len: 515
phnid_len: 99
gradfreqktts dt: 6.798441894818097
gradfreqktts RTF: 1.6087058269739338
[seq]mel2wav
[seq]wav2utmos


 81%|██████████████████████████████████████████████████████████████████████████▋                 | 82/101 [20:54<04:03, 12.80s/it]

utmos: 3.664729356765747
test_ds_index_82: LJ046-0123
[seq]text2mel
phonemes_len: 1130
phnid_len: 213
gradfreqktts dt: 13.367738192901015
gradfreqktts RTF: 1.6194105306866835
[seq]mel2wav
[seq]wav2utmos


 82%|███████████████████████████████████████████████████████████████████████████▌                | 83/101 [21:13<04:26, 14.83s/it]

utmos: 3.644918441772461
test_ds_index_83: LJ030-0044
[seq]text2mel
phonemes_len: 965
phnid_len: 185
gradfreqktts dt: 13.047079807147384
gradfreqktts RTF: 1.6526201157375908
[seq]mel2wav
[seq]wav2utmos


 83%|████████████████████████████████████████████████████████████████████████████▌               | 84/101 [21:32<04:32, 16.04s/it]

utmos: 4.186734676361084
test_ds_index_84: LJ018-0051
[seq]text2mel
phonemes_len: 820
phnid_len: 157
gradfreqktts dt: 9.97912065172568
gradfreqktts RTF: 1.4793971222202509
[seq]mel2wav
[seq]wav2utmos


 84%|█████████████████████████████████████████████████████████████████████████████▍              | 85/101 [21:47<04:13, 15.85s/it]

utmos: 3.915384531021118
test_ds_index_85: LJ042-0231
[seq]text2mel
phonemes_len: 574
phnid_len: 111
gradfreqktts dt: 7.118880120106041
gradfreqktts RTF: 1.5844164511500547
[seq]mel2wav
[seq]wav2utmos


 85%|██████████████████████████████████████████████████████████████████████████████▎             | 86/101 [21:58<03:34, 14.28s/it]

utmos: 4.328577518463135
test_ds_index_86: LJ011-0121
[seq]text2mel
phonemes_len: 1123
phnid_len: 213
gradfreqktts dt: 14.326740434858948
gradfreqktts RTF: 1.6743588162983367
[seq]mel2wav
[seq]wav2utmos


 86%|███████████████████████████████████████████████████████████████████████████████▏            | 87/101 [22:18<03:45, 16.10s/it]

utmos: 3.712336540222168
test_ds_index_87: LJ016-0186
[seq]text2mel
phonemes_len: 848
phnid_len: 161
gradfreqktts dt: 10.332503892015666
gradfreqktts RTF: 1.4982619871826692
[seq]mel2wav
[seq]wav2utmos


 87%|████████████████████████████████████████████████████████████████████████████████▏           | 88/101 [22:34<03:28, 16.02s/it]

utmos: 4.1749396324157715
test_ds_index_88: LJ011-0164
[seq]text2mel
phonemes_len: 1007
phnid_len: 191
gradfreqktts dt: 13.856076635885984
gradfreqktts RTF: 1.7271531850425443
[seq]mel2wav
[seq]wav2utmos


 88%|█████████████████████████████████████████████████████████████████████████████████           | 89/101 [22:54<03:26, 17.23s/it]

utmos: 3.5691959857940674
test_ds_index_89: LJ026-0039
[seq]text2mel
phonemes_len: 1150
phnid_len: 219
gradfreqktts dt: 12.989226150792092
gradfreqktts RTF: 1.6356704394243742
[seq]mel2wav
[seq]wav2utmos


 89%|█████████████████████████████████████████████████████████████████████████████████▉          | 90/101 [23:13<03:15, 17.74s/it]

utmos: 3.1614885330200195
test_ds_index_90: LJ003-0105
[seq]text2mel
phonemes_len: 971
phnid_len: 185
gradfreqktts dt: 12.71089394390583
gradfreqktts RTF: 1.6791795165304086
[seq]mel2wav
[seq]wav2utmos


 90%|██████████████████████████████████████████████████████████████████████████████████▉         | 91/101 [23:32<03:00, 18.06s/it]

utmos: 4.08350944519043
test_ds_index_91: LJ039-0104
[seq]text2mel
phonemes_len: 383
phnid_len: 75
gradfreqktts dt: 5.450477220118046
gradfreqktts RTF: 1.5914065506303352
[seq]mel2wav
[seq]wav2utmos


 91%|███████████████████████████████████████████████████████████████████████████████████▊        | 92/101 [23:40<02:16, 15.14s/it]

utmos: 4.033607006072998
test_ds_index_92: LJ002-0038
[seq]text2mel
phonemes_len: 642
phnid_len: 123
gradfreqktts dt: 8.71982081187889
gradfreqktts RTF: 1.5453964835867535
[seq]mel2wav
[seq]wav2utmos


 92%|████████████████████████████████████████████████████████████████████████████████████▋       | 93/101 [23:53<01:56, 14.53s/it]

utmos: 3.897439956665039
test_ds_index_93: LJ046-0194
[seq]text2mel
phonemes_len: 1259
phnid_len: 239
gradfreqktts dt: 15.725392325781286
gradfreqktts RTF: 1.6221224774676148
[seq]mel2wav
[seq]wav2utmos


 93%|█████████████████████████████████████████████████████████████████████████████████████▌      | 94/101 [24:17<02:00, 17.15s/it]

utmos: 3.3616788387298584
test_ds_index_94: LJ008-0115
[seq]text2mel
phonemes_len: 845
phnid_len: 161
gradfreqktts dt: 9.761282260995358
gradfreqktts RTF: 1.5656735470128291
[seq]mel2wav
[seq]wav2utmos


 94%|██████████████████████████████████████████████████████████████████████████████████████▌     | 95/101 [24:31<01:38, 16.40s/it]

utmos: 3.8991506099700928
test_ds_index_95: LJ016-0104
[seq]text2mel
phonemes_len: 521
phnid_len: 101
gradfreqktts dt: 6.2250189878977835
gradfreqktts RTF: 1.591033807992699
[seq]mel2wav
[seq]wav2utmos


 95%|███████████████████████████████████████████████████████████████████████████████████████▍    | 96/101 [24:39<01:08, 13.72s/it]

utmos: 4.358458042144775
test_ds_index_96: LJ019-0301
[seq]text2mel
phonemes_len: 1122
phnid_len: 213
gradfreqktts dt: 13.690565771888942
gradfreqktts RTF: 1.6679023120919774
[seq]mel2wav
[seq]wav2utmos


 96%|████████████████████████████████████████████████████████████████████████████████████████▎   | 97/101 [24:58<01:01, 15.38s/it]

utmos: 4.352012634277344
test_ds_index_97: LJ028-0012
[seq]text2mel
phonemes_len: 1165
phnid_len: 219
gradfreqktts dt: 15.45112262479961
gradfreqktts RTF: 1.7351351342325587
[seq]mel2wav
[seq]wav2utmos


 97%|█████████████████████████████████████████████████████████████████████████████████████████▎  | 98/101 [25:20<00:52, 17.47s/it]

utmos: 4.210286617279053
test_ds_index_98: LJ018-0059
[seq]text2mel
phonemes_len: 1004
phnid_len: 191
gradfreqktts dt: 13.267337730154395
gradfreqktts RTF: 1.6140580693298927
[seq]mel2wav
[seq]wav2utmos


 98%|██████████████████████████████████████████████████████████████████████████████████████████▏ | 99/101 [25:40<00:36, 18.04s/it]

utmos: 3.9591476917266846
test_ds_index_99: LJ029-0081
[seq]text2mel
phonemes_len: 1427
phnid_len: 269
gradfreqktts dt: 21.092320363037288
gradfreqktts RTF: 1.9043405398526443
[seq]mel2wav
[seq]wav2utmos


 99%|██████████████████████████████████████████████████████████████████████████████████████████ | 100/101 [26:09<00:21, 21.35s/it]

utmos: 4.167963981628418
test_ds_index_100: LJ017-0230
[seq]text2mel
phonemes_len: 1085
phnid_len: 207
gradfreqktts dt: 12.798251107800752
gradfreqktts RTF: 1.5395940823968148
[seq]mel2wav
[seq]wav2utmos


100%|███████████████████████████████████████████████████████████████████████████████████████████| 101/101 [26:28<00:00, 15.73s/it]

utmos: 4.021300315856934





In [20]:
#RESULT_JSON_PATH = RESULT_DIR_PATH / 'eval4mid.json'
if RESULT_JSON_PATH.exists() == False:
    with open(RESULT_JSON_PATH, 'w') as f:
        for entry in eval_list:
            f.write(json.dumps(entry) + '\n')
    print(f'Make {RESULT_JSON_PATH}')
else:
    print(f'Already Exists {RESULT_JSON_PATH}')

Make result4eval/infer4mid/gradfreqktts/cpu/e1000/eval4mid.json


In [21]:
eval_list

[{'name': 'LJ045-0049',
  'phonemes_len': 1127,
  'phnid_len': 215,
  'dt': 22.842133776284754,
  'RTF4mel': 2.8848346417194306,
  'utmos': 3.8492660522460938},
 {'name': 'LJ017-0027',
  'phonemes_len': 411,
  'phnid_len': 79,
  'dt': 5.010633602272719,
  'RTF4mel': 1.80577391033789,
  'utmos': 4.283524513244629},
 {'name': 'LJ023-0031',
  'phonemes_len': 223,
  'phnid_len': 45,
  'dt': 3.5765119860880077,
  'RTF4mel': 1.855753230733259,
  'utmos': 3.367459297180176},
 {'name': 'LJ032-0046',
  'phonemes_len': 650,
  'phnid_len': 125,
  'dt': 7.788422550074756,
  'RTF4mel': 1.5315952949231983,
  'utmos': 3.553670644760132},
 {'name': 'LJ030-0026',
  'phonemes_len': 1025,
  'phnid_len': 195,
  'dt': 12.226165103726089,
  'RTF4mel': 1.6251141766562998,
  'utmos': 3.5788278579711914},
 {'name': 'LJ020-0041',
  'phonemes_len': 1154,
  'phnid_len': 219,
  'dt': 13.538582754321396,
  'RTF4mel': 1.530336233456297,
  'utmos': 4.138723373413086},
 {'name': 'LJ001-0070',
  'phonemes_len': 1137,
 

## recheck eval_json

In [22]:
target_model = 'gradfreqktts'
target_device = 'cpu'
target_epoch_str = 'e1000'

print(target_model)
print(RESULT_DIR_PATH)

# /result4eval/infer4mid/gradtts/cpu/e500/eval4mid.json
# result4eval/<runtime_name>/<model_name>/<device_name>/<target_epoch>/eval.json
# or
# result4eval/json4mid/<target_epoch>/eval4mid_<model_shortword>_<kernel_size>.json
"""
eval_info = {
    'gradtts_cpu': 'result4eval/infer4mid/gradtts/cpu/eval4mid.json',
    'gradseptts_cpu': 'result4eval/infer4mid/gradseptts/cpu/eval4mid.json',
    'gradtfktts_cpu': 'result4eval/infer4mid/gradtfktts/cpu/eval4mid.json',
    'gradtfk5tts_cpu': 'result4eval/infer4mid/gradtfk5tts/cpu/eval4mid.json',
    'gradtimektts_cpu': 'result4eval/infer4mid/gradtimektts/cpu/eval4mid.json',
    'gradfreqktts_cpu': 'result4eval/infer4mid/gradfreqktts/cpu/eval4mid.json',
}
"""
"""
eval_info = {
    'cpu': {
        'e500': {
            'gradtts':f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradseptts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfktts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfk5tts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtimektts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradfreqktts': f'{RESULT_DIR_PATH}/eval4mid.json',
        },
        'e1000':{
            'gradtts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradseptts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfktts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfk5tts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtimektts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradfreqktts': f'{RESULT_DIR_PATH}/eval4mid.json',
        }
    }
}
"""

eval_jsonl_path = RESULT_DIR_PATH / 'eval4mid.json'
print(f'eval_jsonl_path: {eval_jsonl_path}')

if eval_jsonl_path.exists() == True:
    print(f'Exist {eval_jsonl_path}')
    import json
    with open(eval_jsonl_path) as f:
        eval_jsonl_list = [json.loads(l) for l in f]
else:
    print(f'No Exists {eval_jsonl_path}')

gradfreqktts
result4eval/infer4mid/gradfreqktts/cpu/e1000
eval_jsonl_path: result4eval/infer4mid/gradfreqktts/cpu/e1000/eval4mid.json
Exist result4eval/infer4mid/gradfreqktts/cpu/e1000/eval4mid.json


In [23]:
eval_jsonl_list

[{'name': 'LJ045-0049',
  'phonemes_len': 1127,
  'phnid_len': 215,
  'dt': 22.842133776284754,
  'RTF4mel': 2.8848346417194306,
  'utmos': 3.8492660522460938},
 {'name': 'LJ017-0027',
  'phonemes_len': 411,
  'phnid_len': 79,
  'dt': 5.010633602272719,
  'RTF4mel': 1.80577391033789,
  'utmos': 4.283524513244629},
 {'name': 'LJ023-0031',
  'phonemes_len': 223,
  'phnid_len': 45,
  'dt': 3.5765119860880077,
  'RTF4mel': 1.855753230733259,
  'utmos': 3.367459297180176},
 {'name': 'LJ032-0046',
  'phonemes_len': 650,
  'phnid_len': 125,
  'dt': 7.788422550074756,
  'RTF4mel': 1.5315952949231983,
  'utmos': 3.553670644760132},
 {'name': 'LJ030-0026',
  'phonemes_len': 1025,
  'phnid_len': 195,
  'dt': 12.226165103726089,
  'RTF4mel': 1.6251141766562998,
  'utmos': 3.5788278579711914},
 {'name': 'LJ020-0041',
  'phonemes_len': 1154,
  'phnid_len': 219,
  'dt': 13.538582754321396,
  'RTF4mel': 1.530336233456297,
  'utmos': 4.138723373413086},
 {'name': 'LJ001-0070',
  'phonemes_len': 1137,
 

## [seq] analysis json

It is sequence of <span style="color: red;">json analysis section</span> from below cells

The 0th data seemed to have a slower inference time than usual, perhaps because the model took longer to load. Therefore, we try to use data from 1 to 101.

In [23]:
# for loading eval_jsonl of each models
target_model = 'gradtts'
target_device = 'cpu'
target_epoch_str = 'e500'
eval_json_dir = 'result4eval/json4mid' # or result4eval/infer4mid

eval_json_dirpath = f'{eval_json_dir}/{target_epoch_str}'
print(f'target_model: {target_model}')
print(f'target_jsonl_path: {eval_json_dirpath}')


# original eval4mid.json path: 'gradtts_cpu': 'result4eval/infer4mid/gradtts/cpu/eval4mid.json'
# diff result4eval/infer4mid/gradtts/cpu/eval4mid.json result4eval/json4mid/eval4mid_gt_k3.json
# no difference between upper files

"""midtermstyle
eval_info = {
    'lj': f'result4eval/infer4PBL/groundtruth/evalljPBL.json',
    'lj_via_hifigan': f'result4eval/infer4PBL/groundtruth/eval4PBL.json',
    'gradtts_cpu': f'{eval_json_dirpath}/eval4mid_gt_k3.json',
    'gradseptts_cpu': f'{eval_json_dirpath}/eval4mid_sgt_k3.json',
    'gradtfktts_cpu': f'{eval_json_dirpath}/eval4mid_tfk_k3.json',
    'gradtfk5tts_cpu': f'{eval_json_dirpath}/eval4mid_tfk_k5.json',
    'gradtimektts_cpu': f'{eval_json_dirpath}/eval4mid_timek_k3.json',
    'gradfreqktts_cpu': f'{eval_json_dirpath}/eval4mid_freqk_k3.json',
}

"""

# result4eval/<runtime_name>/<model_name>/<device_name>/<target_epoch>/eval.json
# or
# result4eval/json4mid/<target_epoch>/eval4mid_<model_shortword>_<kernel_size>.json

eval_info = {
    'lj': f'result4eval/infer4PBL/groundtruth/evalljPBL.json',
    'lj_via_hifigan': f'result4eval/infer4PBL/groundtruth/eval4PBL.json',
    'cpu': {
        'e500': {
            'gradtts':f'{eval_json_dirpath}/eval4mid_gt_k3.json',
            'gradseptts': f'{eval_json_dirpath}/eval4mid_sgt_k3.json',
            'gradtfktts': f'{eval_json_dirpath}/eval4mid_tfk_k3.json',
            'gradtfk5tts': f'{eval_json_dirpath}/eval4mid_tfk_k5.json',
            'gradtimektts': f'{eval_json_dirpath}/eval4mid_timek_k3.json',
            'gradfreqktts': f'{eval_json_dirpath}/eval4mid_freqk_k3.json',
        },
        'e1000':{
            
        }
    }
}

#eval_jsonl_path = Path(eval_info[eval_target]) # midtermstyle
eval_jsonl_path = Path(eval_info[target_device][target_epoch_str][target_model])


if eval_jsonl_path.exists() == True:
    print(f'Exist: {eval_jsonl_path}')
    import json
    with open(eval_jsonl_path) as f:
        eval_jsonl_list = [json.loads(l) for l in f]
else:
    print(f'No Exists: {eval_jsonl_path}')


# for loading test_dataset.json
test_dataset_json_path = Path('configs/test_dataset.json')
import json
with open(test_dataset_json_path) as f:
    test_dataset_list = [json.loads(l) for l in f]
print(f'load: {test_dataset_json_path}')

target_model: gradtts
target_jsonl_path: result4eval/json4mid/e500
Exist: result4eval/json4mid/e500/eval4mid_gt_k3.json
load: configs/test_dataset.json


In [24]:
def round_significant_digits(value, significant_digits=5):
    if value == 0:
        return 0

    import math
    scale = math.floor(-math.log10(abs(value)))  # Find the first nonzero after the decimal point
    factor = 10 ** (scale + significant_digits - 1)  # Scale to hold 5 significant digits

    rounded_value = round(value * factor,1) / factor  # Adjust and round off the scale
    return rounded_value

#input_value = 0.06238165879957992
#input_value = 0.007710418871435095

# Rounded to the nearest whole number to ensure significant figures
#result = round_significant_digits(input_value, significant_digits=5)
#print(result) 

In [25]:
# for culc phonemes length
#text = test_ds_list[i]['text']
#phonemes, phnid = text2phnid(text, phn2id, 'en')
#phonemes_len_int = len(phonemes)
#phnid_len_int = len(phnid)

#test_dataset_list[0][0]

In [26]:
# The 0th data seemed to have a slower inference time than usual, 
# perhaps because the model took longer to load.
# Therefore, we try to use data from 1 to 101

#eval_jsonl_list[0]

In [27]:
#counter_list = list(range(1,101))
#print(f'counter_list_len: {len(counter_list)}')
#print(f'counter_width: {counter_list[0]}-{counter_list[-1]}')
#print(len(dt_nparr))

In [28]:
# culc for evaluation Indicates
dt_list = [eval_jsonl_list[n]['dt'] for n in range(len(eval_jsonl_list))]
RTF4mel_list = [eval_jsonl_list[n]['RTF4mel'] for n in range(len(eval_jsonl_list))]
utmos_list = [eval_jsonl_list[n]['utmos'] for n in range(len(eval_jsonl_list))]

In [29]:
dt_nparr = np.array(dt_list[1:101])
RTF4mel_nparr = np.array(RTF4mel_list[1:101])
utmos_nparr = np.array(utmos_list[1:101])
print(len(dt_nparr))

100


In [30]:
significant_digits = 5

# for culc difference time to infer text2mel
dt_mean = round_significant_digits(np.mean(dt_nparr), significant_digits=significant_digits)
dt_var = round_significant_digits(np.var(dt_nparr), significant_digits=significant_digits)
dt_std = round_significant_digits(np.std(dt_nparr), significant_digits=significant_digits)
print(f'{target_model}: dt ---------------------------')
print(f'dt mean: {dt_mean}')
print(f'dt var: {dt_var}')
print(f'dt std: {dt_std}')

# for culc RTF4mel to infer text2mel
RTF4mel_mean = round_significant_digits(np.mean(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_var = round_significant_digits(np.var(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_std = round_significant_digits(np.std(RTF4mel_nparr), significant_digits=significant_digits)
print(f'{target_model}: RTF ---------------------------')
print(f'RTF mean: {RTF4mel_mean}')
print(f'RTF var: {RTF4mel_var}')
print(f'RTF std: {RTF4mel_std}')

# for culc utmos to infer 
print(f'{target_model}: utmos ---------------------------')
utmos_mean = round_significant_digits(np.mean(utmos_nparr), significant_digits=significant_digits)
utmos_var = round_significant_digits(np.var(utmos_nparr), significant_digits=significant_digits)
utmos_std = round_significant_digits(np.std(utmos_nparr), significant_digits=significant_digits)
print(f'utmos mean: {utmos_mean}')
print(f'utmos var: {utmos_var}')
print(f'utmos std: {utmos_std}')

gradfreqktts: dt ---------------------------
dt mean: 14.776
dt var: 24.908
dt std: 4.9908
gradfreqktts: RTF ---------------------------
RTF mean: 2.2504
RTF var: 0.0038915
RTF std: 0.062382
gradfreqktts: utmos ---------------------------
utmos mean: 4.1955
utmos var: 0.065222
utmos std: 0.25539
