## text2mos straight eval

<span style="font-size: 200%; color: red;">First! check the changing "your config.yaml" </span>

## [seq] import

In [1]:
import os
import json
import yaml
import sys
import time
import copy
import IPython.display as ipd
import pprint
from pathlib import Path
from tqdm import tqdm


import numpy as np
import torch
import torchaudio
from librosa.filters import mel as librosa_mel_fn
#import matplotlib
#matplotlib.use("Agg")
import matplotlib.pyplot as plt
from scipy.io.wavfile import write


import toybox

In [2]:
def plot_audio(audio, samplerate, title='time-domain waveform'):
    """
    usage:
        # audio is [channel, time(num_frames)] ex.torch.Size([1, 68608])
        # audio[0,:]: list of 1ch audio data
        # audio.shape[1]: int value of 1ch audio data length
        audio, sample_rate = torchaudio.load(str(iwav_path))
        %matplotlib inline
        plot_audio(audio, sample_rate)
    """
    # transform to mono
    channel = 0
    audio = audio[channel,:].view(1,-1)
    # to numpy
    audio = audio.to('cpu').detach().numpy().copy()
    time = np.linspace(0., audio.shape[1]/samplerate, audio.shape[1])
    
    fig, ax = plt.subplots(figsize=(12,9))
    
    ax.plot(time, audio[0, :])
    ax.set_title(title, fontsize=20, y=-0.12)
    ax.tick_params(direction='in')
    #ax.set_xlim(0, 3)
    ax.set_xlabel('Time')
    ax.set_ylabel('Amp')
    #ax.legend()
    plt.tight_layout()
    fig.canvas.draw()
    plt.show()
    #fig.savefig('figure.png')
    plt.close(fig)
    return fig

def plot_mel(tensors:list, titles:list[str]):
    """
    usage:
        mel = mel_process(...)
        fig_mel = plot_mel([mel_groundtruth[0], mel_prediction[0]],
                            ['groundtruth', 'inferenced(model)'])

    """
    xlim = max([t.shape[1] for t in tensors])
    fig, axs = plt.subplots(nrows=len(tensors),
                            ncols=1,
                            figsize=(12, 9),
                            constrained_layout=True)

    if len(tensors) == 1:
        axs = [axs]
    
    for i in range(len(tensors)):
        im = axs[i].imshow(tensors[i],
                           aspect="auto",
                           origin="lower",
                           interpolation='none')
        #plt.colorbar(im, ax=axs[i])
        fig.colorbar(im, ax=axs[i])
        axs[i].set_title(titles[i])
        axs[i].set_xlim([0, xlim])
    fig.canvas.draw()
    #plt.show()
    #plt.close()
    plt.close(fig)  # fig.close() 
    return fig

def convert_phn_to_id(phonemes, phn2id):
    """
    phonemes: phonemes separated by ' '
    phn2id: phn2id dict
    """
    return [phn2id[x] for x in ['<bos>'] + phonemes.split(' ') + ['<eos>']]


def text2phnid(text, phn2id, language='en', add_blank=True):
    if language == 'en':
        from text import G2pEn
        word2phn = G2pEn()
        phonemes = word2phn(text)
        if add_blank:
            phonemes = ' <blank> '.join(phonemes)
        return phonemes, convert_phn_to_id(phonemes, phn2id)
    else:
        raise ValueError(
            'Language should be en (for English)!')

In [3]:
# 
# ckpt_file_dir: logs4model/<model_name>/<runtime_name>/ckpt/
#500_397001,500_397003
info_models_e500 = [
    {
        "model_name": "gradtts",
        "config_path": "configs/config_gt_k3.yaml",
        "runtime_name": "run_gt_k3",
        "ckpt_filename": "gradtts_480_381121.pt.pt"
    },
    {
        "model_name": "gradseptts",
        "config_path": "configs/config_sgt_k3.yaml",
        "runtime_name": "run_sgt_k3",
        "ckpt_filename": "gradseptts_480_381121.pt"
    },
    {
        "model_name": "gradtfktts",
        "config_path": "configs/config_tfk_k3.yaml",
        "runtime_name": "run_tfk_k3",
        "ckpt_filename": "gradtfktts_480_381121.pt"
    },
    {
        "model_name": "gradtfk5tts",
        "config_path": "configs/config_tfk_k5.yaml",
        "runtime_name": "run_tfk_k5",
        "ckpt_filename": "gradtfk5tts_480_381121.pt"
    },
    {
        "model_name": "gradtimektts",
        "config_path": "configs/config_timek_k3.yaml",
        "runtime_name": "run_timek_k3",
        "ckpt_filename": "gradtimektts_480_381121.pt"
    },
    {
        "model_name": "gradfreqktts",
        "config_path": "configs/config_freqk_k3.yaml",
        "runtime_name": "run_freqk_k3",
        "ckpt_filename": "gradfreqktts_480_381121.pt"
    },
    {
        "model_name": "gradtfkfultts",
        "config_path": "configs/config_tfkfulmask_k3.yaml",
        "runtime_name": "run_tfkfulmask_k3",
        "ckpt_filename":"gradtfkfulttsmask_500_396308.pt"
    }
]

# logs4model/gradtfkfultts/run_tfkfulmask_k3/ckpt/
"""
    {
        "model_name": "gradtfkfultts",
        "config_path": "configs/config_tfkful_k3.yaml",
        "runtime_name": "run_tfkful_k3",
        "ckpt_filename": "gradtfkfultts_480_381123.pt"
    }
]
"""

info_models_e1000 = [
    {
        "model_name": "gradtts",
        "config_path": "configs/config_gt_k3.yaml",
        "runtime_name": "run_gt_k3",
        "ckpt_filename": "gradtts_1000_794002.pt"
    },
    {
        "model_name": "gradseptts",
        "config_path": "configs/config_sgt_k3.yaml",
        "runtime_name": "run_sgt_k3",
        "ckpt_filename": "gradseptts_1000_794002.pt"
    },
    {
        "model_name": "gradtfktts",
        "config_path": "configs/config_tfk_k3.yaml",
        "runtime_name": "run_tfk_k3",
        "ckpt_filename": "gradtfktts_1000_794002.pt"
    },
    {
        "model_name": "gradtfk5tts",
        "config_path": "configs/config_tfk_k5.yaml",
        "runtime_name": "run_tfk_k5",
        "ckpt_filename": "gradtfk5tts_1000_794002.pt"
    },
    {
        "model_name": "gradtimektts",
        "config_path": "configs/config_timek_k3.yaml",
        "runtime_name": "run_timek_k3",
        "ckpt_filename": "gradtimektts_1000_794002.pt"
    },
    {
        "model_name": "gradfreqktts",
        "config_path": "configs/config_freqk_k3.yaml",
        "runtime_name": "run_freqk_k3",
        "ckpt_filename": "gradfreqktts_1000_794002.pt"
    }
]



target_epoch = 500
print(f'target_epoch: {target_epoch}')
if target_epoch == 500:
    info_models = copy.deepcopy(info_models_e500)
elif target_epoch == 1000:
    info_models = copy.deepcopy(info_models_e1000)
else:
    print('Do not supported')

target_epoch: 500


you can choose

- <span style="font-size: 200%; color: red;">inference section</span>
- <span style="font-size: 200%; color: red;">json analysis section</span>

if you choose <span style="color: red;">inference section</span>,
you keep go ahead below process.

but you choose <span style="color: red;">json analysis section</span>,
you need to <span style="color: blue;">jump to the end this page</span>.

## [seq] check configuration

In [6]:
# First, please check changing <model_name>
config_yaml = 'configs/config_exp_mid.yaml'
config = toybox.load_yaml_and_expand_var('configs/config_exp_mid.yaml')

In [7]:
# for model
model_name = config['model_name']
if model_name == 'gradtts':
    choise_idx = 0
elif model_name == 'gradseptts':
    choise_idx = 1
elif model_name == 'gradtfktts':
    choise_idx = 2
elif model_name == 'gradtfk5tts':
    choise_idx = 3
elif model_name == 'gradtimektts':
    choise_idx = 4
elif model_name == 'gradfreqktts':
    choise_idx = 5
elif model_name == 'gradtfkfultts':
    choise_idx = 6
else:
    os._exit(os.EX_OK)
    print('Alart: carefully chose model ')
print(f'model_name: {model_name}')

# for runtime to load model
runtime_name = config['runtime_name']
print(f'runtime_name: {runtime_name}')
config_path4model = info_models[choise_idx]["config_path"]
runtime_name4model = info_models[choise_idx]["runtime_name"]
ckpt_dir = f'logs4model/{model_name}/{runtime_name4model}/ckpt'
ckpt_path = ckpt_dir + "/" + info_models[choise_idx]["ckpt_filename"]
print(f"ckpt_path: {ckpt_path}")

model_name: gradtfkfultts
runtime_name: run_tfkfulmask_k3
ckpt_path: logs4model/gradtfkfultts/run_tfkfulmask_k3/ckpt/gradtfkfulttsmask_500_396308.pt


In [8]:
# for audio params
n_mels: int = config['n_mels'] # 80
n_fft: int = config['n_fft'] # 1024
sample_rate: int = config['sample_rate'] # 22050
hop_size: int = config['hop_size'] # 256
win_size: int = config['win_size'] # 1024
f_min: int = config['f_min'] # 0
f_max: int = config['f_max'] # 8000
random_seed: int = config['random_seed'] # 1234
print(n_mels, n_fft, sample_rate, hop_size, win_size, f_min, f_max, random_seed)

80 1024 22050 256 1024 0 8000 1234


In [9]:
print(f"phn2id_path: {config['phn2id_path']}")
with open(config['phn2id_path']) as f:
    phn2id = json.load(f)

vocab_size = len(phn2id) + 1

phn2id_path: ./configs/phn2id.json


In [10]:
# for hifigan
# setting file paths
# from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS/hifi-gan
# https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing
HiFiGAN_CONFIG = './hifigan/official_pretrained/LJ_V2/config.json'
HiFiGAN_ckpt = './hifigan/official_pretrained/LJ_V2/generator_v2'

from hifigan import models, env

with open(HiFiGAN_CONFIG) as f:
    hifigan_hparams = env.AttrDict(json.load(f))

hifigan_randomseed = hifigan_hparams.seed
print(f'hifigan_randomseed: {hifigan_randomseed}')

hifigan_randomseed: 1234


## [seq] device setting

In [11]:
import os

print(f"all cpu at using device: {os.cpu_count()}")
print(f"Number of available CPU: {len(os.sched_getaffinity(0))}") # Number of available CPUs can also be obtained. ,use systemcall at linux.
print(f"GPU_name: {torch.cuda.get_device_name()}\nGPU avail: {torch.cuda.is_available()}\n")

all cpu at using device: 52
Number of available CPU: 4


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [12]:
DEVICE = 'cpu' # 'cuda' or 'cpu'
flag_avail_cuda = torch.cuda.is_available()
if flag_avail_cuda == True:
    print('avail cuda')
    print('use cuda')
elif flag_avail_cuda == False and DEVICE=='cpu':
    print('Not avail cuda')
    print('use cpu')
else:
    os._exit(os.EX_OK)

device = torch.device(DEVICE)
print(f'device: {device}')

# setting random_seed ==============
print(f'device: {random_seed}')
toybox.set_seed(random_seed)

# reprint target epoch for ckpt
print(f'check for target_epoch: {target_epoch}')

Not avail cuda
use cpu
device: cpu
device: 1234
check for target_epoch: 500


In [13]:
print(str(torch.get_default_device()))

cpu


## [seq] setting path

In [14]:
# for test_dataset
# RESULT_DIR_PATH = RESULT_DIR_PATH
# RESULT_MEL_DIR_PATH = IMEL_DIR_PATH
# RESULT_WAV_DIR_PATH = IWAV_DIR_PATH
# RESULT_JSON_DIR_PATH = RESULT_JSON_PATH
test_ds_path = Path(config['test_datalist_path'])

# text2mel
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel'
#print(RESULT_MEL_DIR_PATH)

# mel2wav
#IMEL_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/mel')
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / 'wav'

# for utmos
#IWAV_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/wav')
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_JSON_DIR_PATH = RESULT_DIR_PATH / RESULTS_JSON_NAME

if model_name == 'groundtruth':
    # for mel2wav
    #IMEL_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/mel/')
    RESULT_MEL_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/mel/')
    # for utmos
    #/result4eval/infer4PBL/groundtruth/cuda/
    #IWAV_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/wav/')
    RESULT_WAV_DIR_PATH = Path('./result4eval/infer4PBL/groundtruth/cuda/wav/')
    RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}')

# style for mid
#RESULTS_JSON_NAME = 'eval4mid.json'
#RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}')
#RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel'
#RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / 'wav'
#RESULT_JSON_PATH = RESULT_DIR_PATH / RESULTS_JSON_NAME

RESULTS_JSON_NAME = 'eval4mid.json'
RESULT_DIR_PATH = Path(f'./result4eval/{runtime_name}/{model_name}/{DEVICE}/{"e"+str(target_epoch)}')
RESULT_MEL_DIR_PATH = RESULT_DIR_PATH / 'mel'
RESULT_WAV_DIR_PATH = RESULT_DIR_PATH / 'wav'
RESULT_JSON_PATH = RESULT_DIR_PATH / RESULTS_JSON_NAME
RESULTS_JSON_NAME = 'eval4mid.json'
print(f'RESULT_DIR_PATH: {RESULT_DIR_PATH}')

RESULT_DIR_PATH: result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500


In [16]:
# check path

# for text2mel
print('test_ds_path-----------------------------------------')
if test_ds_path.exists():
    print(f'Exists {str(test_ds_path)}')
    with open(config['test_datalist_path']) as j:
        test_ds_list = json.load(j)
    print(f'loaded {test_ds_path}')
else:
    print(f'No exist {test_ds_path}')

print('RESULT_DIR_PATH-------------------------------------------')
if RESULT_DIR_PATH.exists():
    print(f'Exists {RESULT_DIR_PATH}')
else:
    RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_DIR_PATH}')

print('RESULT_MEL_DIR_PATH-------------------------------------------')
if RESULT_MEL_DIR_PATH.exists():
    print(f'Exists {RESULT_MEL_DIR_PATH}')
else:
    RESULT_MEL_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_MEL_DIR_PATH}')

print('RESULT_WAV_DIR_PATH-------------------------------------------')
if RESULT_WAV_DIR_PATH.exists():
    print(f'Exists {RESULT_WAV_DIR_PATH}')
else:
    RESULT_WAV_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_WAV_DIR_PATH}')

print('RESULT_JSON_PATH-------------------------------------------')
if RESULT_JSON_PATH.exists():
    print(f'Exists {RESULT_JSON_PATH}')
else:
    #RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_JSON_PATH}')

test_ds_path-----------------------------------------
Exists configs/test_dataset.json
loaded configs/test_dataset.json
RESULT_DIR_PATH-------------------------------------------
Exists result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500
RESULT_MEL_DIR_PATH-------------------------------------------
Exists result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/mel
RESULT_WAV_DIR_PATH-------------------------------------------
Exists result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/wav
RESULT_JSON_PATH-------------------------------------------
No exist result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/eval4mid.json


## [seq] load Model

In [17]:
# import models
from gradtts import GradTTS
from gradseptts import GradSepTTS
from gradtfktts import GradTFKTTS
from gradtfk5tts import GradTFKTTS as GradTFK5TTS
from gradtimektts import GradTimeKTTS
from gradfreqktts import GradFreqKTTS
from gradtfkfultts import GradTFKFULTTS

print(model_name)
print("[seq] loading Model")

print("loading diffusion-TTS ===================================")
N_STEP = 50
TEMP = 1.5

print('loading ', ckpt_path)
_, _, state_dict = torch.load(ckpt_path,
                            map_location=device)

#with open(config_path4model) as f:
#    config = yaml.load(f, yaml.SafeLoader)
config4model = toybox.load_yaml_and_expand_var(config_path4model)

print("[seq] Initializing diffusion-TTS...")
if model_name == "gradtts":
    model = GradTTS.build_model(config4model, vocab_size)
elif model_name == "gradseptts":
    model = GradSepTTS.build_model(config4model, vocab_size)
elif model_name == "gradtfktts":
    model = GradTFKTTS.build_model(config4model, vocab_size)
elif model_name == "gradtfk5tts":
    model = GradTFK5TTS.build_model(config4model, vocab_size)
elif model_name == "gradtfkfultts":
    model = GradTFKFULTTS.build_model(config4model, vocab_size)
elif model_name == "gradtimektts":
    model = GradTimeKTTS.build_model(config4model, vocab_size)
elif model_name == "gradfreqktts":
    model = GradFreqKTTS.build_model(config4model, vocab_size)
else:
    raise ValueError(f"Error: '{model_name}' is not supported")

model = model.to(device)
model.load_state_dict(state_dict)
print(f'Number of encoder + duration predictor parameters: {model.encoder.nparams/1e6}m')
print(f'Number of decoder parameters: {model.decoder.nparams/1e6}m')
print(f'Total parameters: {model.nparams/1e6}m')

print("loading HiFi-GAN ===================================")
"""
#setting file paths
# from https://github.com/huawei-noah/Speech-Backbones/tree/main/Grad-TTS/hifi-gan
# https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing
HiFiGAN_CONFIG = './hifigan/official_pretrained/LJ_V2/config.json'
HiFiGAN_ckpt = './hifigan/official_pretrained/LJ_V2/generator_v2'

from hifigan import models, env

with open(HiFiGAN_CONFIG) as f:
    hifigan_hparams = env.AttrDict(json.load(f))

hifigan_randomseed = hifigan_hparams.seed
print(f'hifigan_randomseed: {hifigan_randomseed}')
"""
# generator ===================
print("[seq] loading HiFiGAN")
vocoder = models.Generator(hifigan_hparams)

vocoder.load_state_dict(torch.load(
    HiFiGAN_ckpt, map_location=device)['generator'])
vocoder = vocoder.eval().to(device)
vocoder.remove_weight_norm()

print("loading UTMOS ===================================")
predictor_utmos = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)



gradtfkfultts
[seq] loading Model
loading  logs4model/gradtfkfultts/run_tfkfulmask_k3/ckpt/gradtfkfulttsmask_500_396308.pt
[seq] Initializing diffusion-TTS...
Number of encoder + duration predictor parameters: 3.549137m
Number of decoder parameters: 2.056663m
Total parameters: 5.6058m
[seq] loading HiFiGAN




Removing weight norm...


Using cache found in /work/sora-sa/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0


In [18]:
"""
ttext = test_ds_list[0]['text']
tphonemes, tphnid = text2phnid(ttext, phn2id, 'en')
print(tphonemes)
print(len(tphonemes))
print(len(tphnid))

tphnid_len = torch.tensor(len(tphnid), dtype=torch.long).unsqueeze(0).to(device)
tphnid = torch.tensor(tphnid).unsqueeze(0).to(device)
print(tphnid_len)
print(tphnid)
"""

"\nttext = test_ds_list[0]['text']\ntphonemes, tphnid = text2phnid(ttext, phn2id, 'en')\nprint(tphonemes)\nprint(len(tphonemes))\nprint(len(tphnid))\n\ntphnid_len = torch.tensor(len(tphnid), dtype=torch.long).unsqueeze(0).to(device)\ntphnid = torch.tensor(tphnid).unsqueeze(0).to(device)\nprint(tphnid_len)\nprint(tphnid)\n"

## [seq] infer

In [19]:
infer_data_num: int = 101 #len(test_ds_list) is 200
print(f'infer_data_num: {infer_data_num}')
print(f'RESULT_DIR_PATH: {RESULT_DIR_PATH}')
print(f'RESULT_MEL_DIR_PATH: {RESULT_MEL_DIR_PATH}')
print(f'RESULT_WAV_DIR_PATH: {RESULT_WAV_DIR_PATH}')
print(f'RESULT_JSON_PATH: {RESULT_JSON_PATH}')

infer_data_num: 101
RESULT_DIR_PATH: result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500
RESULT_MEL_DIR_PATH: result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/mel
RESULT_WAV_DIR_PATH: result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/wav
RESULT_JSON_PATH: result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/eval4mid.json


In [20]:
#a_num = range(90,101)
#print(range(infer_data_num)[-1])
#print(a_num[-1])

In [21]:
eval_list = []


for i in tqdm(range(infer_data_num)):
    test_ds_filename = test_ds_list[i]['name']
    mel_npy_path = RESULT_MEL_DIR_PATH / f"{test_ds_filename}.npy"
    synth_wav_path = RESULT_WAV_DIR_PATH / f"{test_ds_filename}.wav"
    print(f'test_ds_index_{i}: {test_ds_filename}')
    # [seq]text2mel =========================================================
    # load txt
    print('[seq]text2mel')
    text = test_ds_list[i]['text']
    phonemes, phnid = text2phnid(text, phn2id, 'en')
    phonemes_len_int = len(phonemes)
    phnid_len_int = len(phnid)
    print(f'phonemes_len: {phonemes_len_int}')
    print(f'phnid_len: {phnid_len_int}')
    phnid_len = torch.tensor(len(phnid), dtype=torch.long).unsqueeze(0).to(device)
    phnid = torch.tensor(phnid).unsqueeze(0).to(device)

    # [seq] synth speech
    # process text to mel
    # mel is [n_mels, n_frame]
    start_time = time.perf_counter()
    _, mel_prediction, _ = model.forward(phnid,
                                        phnid_len,
                                        n_timesteps=N_STEP,
                                        temperature=TEMP,
                                        solver='original')
    end_time = time.perf_counter()

    dt = end_time - start_time
    dt4mel = dt * 22050 / ( mel_prediction.shape[-1] * 256)
    print(f'{model_name} dt: {dt}')
    print(f'{model_name} RTF: {dt4mel}')
    
    # for save mel
    mel4save = mel_prediction.unsqueeze(0) # [batch, channel(freq), n_frame(time)] ex.[1, 80, 619]
    # save
    #mel_npy_path =  RESULT_MEL_DIR_PATH / f"{test_ds_filename}.npy"
    #print(f'test_ds_index_{i}: {mel_npy_path}')
    np.save(mel_npy_path, mel4save.cpu().detach().numpy().copy())

    # [seq]mel2wav =========================================================
    print('[seq]mel2wav')
    x = np.load(mel_npy_path) # [1, n_mel, n_frame]
    x2audio = torch.FloatTensor(x).to(device)
    x2audio = x2audio.squeeze().unsqueeze(0)
    # x2audio is [1, n_mels, n_frames]
    assert x2audio.shape[0] == 1
    with torch.no_grad():
        # vocoder.forward(x).cpu() is torch.Size([1, 1, 167168])
        audio = (vocoder.forward(x2audio).cpu().squeeze().clamp(-1,1).numpy() * 32768).astype(np.int16)
    write(
        synth_wav_path,
        hifigan_hparams.sampling_rate,
        audio)

    # [seq]wav2utmos =========================================================
    print('[seq]wav2utmos')
    #iwav_path = RESULT_WAV_DIR_PATH / f"{filename}.wav"
    #wav, samplerate = torchaudio.load(iwav_path)
    wav, samplerate = torchaudio.load(synth_wav_path)
    score_utmos = predictor_utmos(wav, samplerate)
    score_utmos_float = score_utmos.item()
    print(f'utmos: {score_utmos_float}')
    #eval_dict = {'name': filename, 'path': str(iwav_path), 'utmos': score_float}
    #score_utmos_list.append(eval_dict)
    
    # path, テキスト文、phonimes, phonimes数, dt, RTF, utmos
    eval_dict = {
        'name': test_ds_filename,
        'phonemes_len': phonemes_len_int,
        'phnid_len': phnid_len_int,
        'dt': dt,
        'RTF4mel': dt4mel,
        'utmos': score_utmos_float
    }
    eval_list.append(eval_dict)
    

  0%|                                                                                                     | 0/101 [00:00<?, ?it/s]

test_ds_index_0: LJ045-0049
[seq]text2mel
phonemes_len: 1127
phnid_len: 215
gradtfkfultts dt: 22.235576699487865
gradtfkfultts RTF: 2.919531644338959
[seq]mel2wav
[seq]wav2utmos


  1%|▉                                                                                          | 1/101 [00:42<1:10:08, 42.08s/it]

utmos: 3.0999863147735596
test_ds_index_1: LJ017-0027
[seq]text2mel
phonemes_len: 411
phnid_len: 79
gradtfkfultts dt: 7.859424853697419
gradtfkfultts RTF: 2.5739709782561206
[seq]mel2wav
[seq]wav2utmos


  2%|█▊                                                                                           | 2/101 [00:52<39:03, 23.67s/it]

utmos: 3.902704954147339
test_ds_index_2: LJ023-0031
[seq]text2mel
phonemes_len: 223
phnid_len: 45
gradtfkfultts dt: 4.456846786662936
gradtfkfultts RTF: 2.7033855536399027
[seq]mel2wav


  3%|██▊                                                                                          | 3/101 [00:59<25:35, 15.66s/it]

[seq]wav2utmos
utmos: 3.1458921432495117
test_ds_index_3: LJ032-0046
[seq]text2mel
phonemes_len: 650
phnid_len: 125
gradtfkfultts dt: 11.059734380804002
gradtfkfultts RTF: 2.075394395907614
[seq]mel2wav
[seq]wav2utmos


  4%|███▋                                                                                         | 4/101 [01:13<24:51, 15.38s/it]

utmos: 3.5897765159606934
test_ds_index_4: LJ030-0026
[seq]text2mel
phonemes_len: 1025
phnid_len: 195
gradtfkfultts dt: 15.682630959898233
gradtfkfultts RTF: 2.225352738015831
[seq]mel2wav
[seq]wav2utmos


  5%|████▌                                                                                        | 5/101 [01:34<27:51, 17.41s/it]

utmos: 2.734996795654297
test_ds_index_5: LJ020-0041
[seq]text2mel
phonemes_len: 1154
phnid_len: 219
gradtfkfultts dt: 20.763847924768925
gradtfkfultts RTF: 2.334789321256705
[seq]mel2wav
[seq]wav2utmos


  6%|█████▌                                                                                       | 6/101 [02:02<32:49, 20.73s/it]

utmos: 3.271646022796631
test_ds_index_6: LJ001-0070
[seq]text2mel
phonemes_len: 1137
phnid_len: 215
gradtfkfultts dt: 18.73468187265098
gradtfkfultts RTF: 2.1924875556850485
[seq]mel2wav
[seq]wav2utmos


  7%|██████▍                                                                                      | 7/101 [02:27<34:38, 22.11s/it]

utmos: 4.015120983123779
test_ds_index_7: LJ019-0334
[seq]text2mel
phonemes_len: 611
phnid_len: 117
gradtfkfultts dt: 8.748563108034432
gradtfkfultts RTF: 2.159135661400421
[seq]mel2wav
[seq]wav2utmos


  8%|███████▎                                                                                     | 8/101 [02:39<29:20, 18.93s/it]

utmos: 3.443108558654785
test_ds_index_8: LJ022-0152
[seq]text2mel
phonemes_len: 387
phnid_len: 75
gradtfkfultts dt: 7.2682701563462615
gradtfkfultts RTF: 2.227888080341346
[seq]mel2wav
[seq]wav2utmos


  9%|████████▎                                                                                    | 9/101 [02:49<24:45, 16.15s/it]

utmos: 2.9242942333221436
test_ds_index_9: LJ050-0154
[seq]text2mel
phonemes_len: 896
phnid_len: 171
gradtfkfultts dt: 14.607130303047597
gradtfkfultts RTF: 2.1109953281131997
[seq]mel2wav
[seq]wav2utmos


 10%|█████████                                                                                   | 10/101 [03:08<26:05, 17.20s/it]

utmos: 3.092604398727417
test_ds_index_10: LJ016-0045
[seq]text2mel
phonemes_len: 1237
phnid_len: 235
gradtfkfultts dt: 23.715348546393216
gradtfkfultts RTF: 2.437553305153502
[seq]mel2wav
[seq]wav2utmos


 11%|██████████                                                                                  | 11/101 [03:39<32:02, 21.37s/it]

utmos: 3.841552495956421
test_ds_index_11: LJ036-0100
[seq]text2mel
phonemes_len: 802
phnid_len: 153
gradtfkfultts dt: 14.49691683985293
gradtfkfultts RTF: 2.32525180632243
[seq]mel2wav
[seq]wav2utmos


 12%|██████████▉                                                                                 | 12/101 [03:58<30:40, 20.68s/it]

utmos: 3.8899331092834473
test_ds_index_12: LJ046-0016
[seq]text2mel
phonemes_len: 884
phnid_len: 167
gradtfkfultts dt: 12.339276042766869
gradtfkfultts RTF: 2.063721455878409
[seq]mel2wav
[seq]wav2utmos


 13%|███████████▊                                                                                | 13/101 [04:15<28:44, 19.59s/it]

utmos: 3.6445813179016113
test_ds_index_13: LJ048-0085
[seq]text2mel
phonemes_len: 1181
phnid_len: 223
gradtfkfultts dt: 18.278586951084435
gradtfkfultts RTF: 2.0852796061227847
[seq]mel2wav
[seq]wav2utmos


 14%|████████████▊                                                                               | 14/101 [04:40<30:36, 21.11s/it]

utmos: 3.8661506175994873
test_ds_index_14: LJ050-0197
[seq]text2mel
phonemes_len: 282
phnid_len: 55
gradtfkfultts dt: 5.4214661521837115
gradtfkfultts RTF: 2.497145067171851
[seq]mel2wav


 15%|█████████████▋                                                                              | 15/101 [04:47<24:23, 17.01s/it]

[seq]wav2utmos
utmos: 4.241243362426758
test_ds_index_15: LJ050-0178
[seq]text2mel
phonemes_len: 709
phnid_len: 135
gradtfkfultts dt: 11.231637702323496
gradtfkfultts RTF: 2.0939665460652823
[seq]mel2wav
[seq]wav2utmos


 16%|██████████████▌                                                                             | 16/101 [05:03<23:26, 16.54s/it]

utmos: 3.31962251663208
test_ds_index_16: LJ043-0079
[seq]text2mel
phonemes_len: 1445
phnid_len: 273
gradtfkfultts dt: 27.84575880598277
gradtfkfultts RTF: 2.7100943753174436
[seq]mel2wav
[seq]wav2utmos


 17%|███████████████▍                                                                            | 17/101 [05:38<30:58, 22.12s/it]

utmos: 3.6991138458251953
test_ds_index_17: LJ050-0207
[seq]text2mel
phonemes_len: 737
phnid_len: 141
gradtfkfultts dt: 12.418150250799954
gradtfkfultts RTF: 2.1918241949774186
[seq]mel2wav
[seq]wav2utmos


 18%|████████████████▍                                                                           | 18/101 [05:54<28:15, 20.43s/it]

utmos: 3.410460948944092
test_ds_index_18: LJ034-0005
[seq]text2mel
phonemes_len: 1442
phnid_len: 273
gradtfkfultts dt: 30.8676890572533
gradtfkfultts RTF: 2.94431990462536
[seq]mel2wav
[seq]wav2utmos


 19%|█████████████████▎                                                                          | 19/101 [06:33<35:18, 25.84s/it]

utmos: 3.4737370014190674
test_ds_index_19: LJ031-0151
[seq]text2mel
phonemes_len: 1203
phnid_len: 227
gradtfkfultts dt: 19.6567889675498
gradtfkfultts RTF: 2.2848778927045013
[seq]mel2wav
[seq]wav2utmos


 20%|██████████████████▏                                                                         | 20/101 [06:58<34:43, 25.72s/it]

utmos: 3.2792251110076904
test_ds_index_20: LJ023-0021
[seq]text2mel
phonemes_len: 564
phnid_len: 109
gradtfkfultts dt: 8.960067794658244
gradtfkfultts RTF: 2.16785348130502
[seq]mel2wav
[seq]wav2utmos


 21%|███████████████████▏                                                                        | 21/101 [07:11<28:54, 21.68s/it]

utmos: 3.8977208137512207
test_ds_index_21: LJ015-0301
[seq]text2mel
phonemes_len: 780
phnid_len: 149
gradtfkfultts dt: 13.267137363553047
gradtfkfultts RTF: 2.062700099181695
[seq]mel2wav
[seq]wav2utmos


 22%|████████████████████                                                                        | 22/101 [07:29<27:07, 20.60s/it]

utmos: 3.279144287109375
test_ds_index_22: LJ021-0153
[seq]text2mel
phonemes_len: 320
phnid_len: 63
gradtfkfultts dt: 5.972881907597184
gradtfkfultts RTF: 2.3817644325542155
[seq]mel2wav


 23%|████████████████████▉                                                                       | 23/101 [07:37<21:59, 16.92s/it]

[seq]wav2utmos
utmos: 2.633690357208252
test_ds_index_23: LJ014-0037
[seq]text2mel
phonemes_len: 365
phnid_len: 71
gradtfkfultts dt: 7.521081945858896
gradtfkfultts RTF: 2.2415638098262956
[seq]mel2wav
[seq]wav2utmos


 24%|█████████████████████▊                                                                      | 24/101 [07:47<19:10, 14.95s/it]

utmos: 2.7266695499420166
test_ds_index_24: LJ004-0200
[seq]text2mel
phonemes_len: 1097
phnid_len: 207
gradtfkfultts dt: 18.076144695281982
gradtfkfultts RTF: 2.0621843467040963
[seq]mel2wav
[seq]wav2utmos


 25%|██████████████████████▊                                                                     | 25/101 [08:07<20:39, 16.31s/it]

utmos: 2.9963247776031494
test_ds_index_25: LJ049-0010
[seq]text2mel
phonemes_len: 471
phnid_len: 91
gradtfkfultts dt: 8.351549501530826
gradtfkfultts RTF: 2.2982186814690833
[seq]mel2wav
[seq]wav2utmos


 26%|███████████████████████▋                                                                    | 26/101 [08:18<18:36, 14.89s/it]

utmos: 3.275547504425049
test_ds_index_26: LJ008-0291
[seq]text2mel
phonemes_len: 365
phnid_len: 71
gradtfkfultts dt: 6.4692652598023415
gradtfkfultts RTF: 2.3610847950649103
[seq]mel2wav
[seq]wav2utmos


 27%|████████████████████████▌                                                                   | 27/101 [08:27<16:10, 13.11s/it]

utmos: 3.485034704208374
test_ds_index_27: LJ048-0221
[seq]text2mel
phonemes_len: 602
phnid_len: 115
gradtfkfultts dt: 10.370465253479779
gradtfkfultts RTF: 2.1420559693422994
[seq]mel2wav
[seq]wav2utmos


 28%|█████████████████████████▌                                                                  | 28/101 [08:41<16:18, 13.40s/it]

utmos: 3.202763080596924
test_ds_index_28: LJ004-0157
[seq]text2mel
phonemes_len: 637
phnid_len: 121
gradtfkfultts dt: 10.619389531202614
gradtfkfultts RTF: 2.127157877571018
[seq]mel2wav
[seq]wav2utmos


 29%|██████████████████████████▍                                                                 | 29/101 [08:56<16:32, 13.79s/it]

utmos: 3.3666017055511475
test_ds_index_29: LJ013-0175
[seq]text2mel
phonemes_len: 681
phnid_len: 131
gradtfkfultts dt: 10.879989729262888
gradtfkfultts RTF: 2.134679078251768
[seq]mel2wav
[seq]wav2utmos


 30%|███████████████████████████▎                                                                | 30/101 [09:11<16:42, 14.12s/it]

utmos: 3.5262491703033447
test_ds_index_30: LJ021-0100
[seq]text2mel
phonemes_len: 752
phnid_len: 143
gradtfkfultts dt: 12.837263561785221
gradtfkfultts RTF: 2.1428480918223425
[seq]mel2wav
[seq]wav2utmos


 31%|████████████████████████████▏                                                               | 31/101 [09:28<17:28, 14.98s/it]

utmos: 3.288264036178589
test_ds_index_31: LJ018-0132
[seq]text2mel
phonemes_len: 687
phnid_len: 131
gradtfkfultts dt: 12.066165769472718
gradtfkfultts RTF: 2.0995814016483068
[seq]mel2wav
[seq]wav2utmos


 32%|█████████████████████████████▏                                                              | 32/101 [09:45<17:47, 15.48s/it]

utmos: 3.7690274715423584
test_ds_index_32: LJ023-0059
[seq]text2mel
phonemes_len: 1184
phnid_len: 225
gradtfkfultts dt: 18.87679046485573
gradtfkfultts RTF: 2.153524574451929
[seq]mel2wav
[seq]wav2utmos


 33%|██████████████████████████████                                                              | 33/101 [10:05<19:11, 16.94s/it]

utmos: 3.0512521266937256
test_ds_index_33: LJ003-0027
[seq]text2mel
phonemes_len: 964
phnid_len: 183
gradtfkfultts dt: 15.97326840274036
gradtfkfultts RTF: 2.2554467743367375
[seq]mel2wav
[seq]wav2utmos


 34%|██████████████████████████████▉                                                             | 34/101 [10:26<20:23, 18.26s/it]

utmos: 3.4452295303344727
test_ds_index_34: LJ018-0133
[seq]text2mel
phonemes_len: 674
phnid_len: 129
gradtfkfultts dt: 11.075541735626757
gradtfkfultts RTF: 2.143747325057672
[seq]mel2wav
[seq]wav2utmos


 35%|███████████████████████████████▉                                                            | 35/101 [10:41<19:00, 17.27s/it]

utmos: 3.835557699203491
test_ds_index_35: LJ033-0060
[seq]text2mel
phonemes_len: 596
phnid_len: 115
gradtfkfultts dt: 10.427914343774319
gradtfkfultts RTF: 2.1233701913436738
[seq]mel2wav
[seq]wav2utmos


 36%|████████████████████████████████▊                                                           | 36/101 [10:55<17:36, 16.26s/it]

utmos: 3.475552558898926
test_ds_index_36: LJ003-0299
[seq]text2mel
phonemes_len: 881
phnid_len: 167
gradtfkfultts dt: 15.535880943760276
gradtfkfultts RTF: 2.245216644884609
[seq]mel2wav
[seq]wav2utmos


 37%|█████████████████████████████████▋                                                          | 37/101 [11:12<17:31, 16.44s/it]

utmos: 2.778435230255127
test_ds_index_37: LJ011-0060
[seq]text2mel
phonemes_len: 1083
phnid_len: 205
gradtfkfultts dt: 16.975360076874495
gradtfkfultts RTF: 2.1953986585907153
[seq]mel2wav
[seq]wav2utmos


 38%|██████████████████████████████████▌                                                         | 38/101 [11:35<19:12, 18.29s/it]

utmos: 3.158228874206543
test_ds_index_38: LJ013-0240
[seq]text2mel
phonemes_len: 768
phnid_len: 147
gradtfkfultts dt: 12.319122682325542
gradtfkfultts RTF: 2.139275572905732
[seq]mel2wav
[seq]wav2utmos


 39%|███████████████████████████████████▌                                                        | 39/101 [11:52<18:27, 17.86s/it]

utmos: 3.373983383178711
test_ds_index_39: LJ047-0076
[seq]text2mel
phonemes_len: 1287
phnid_len: 245
gradtfkfultts dt: 29.278895157389343
gradtfkfultts RTF: 2.8953772523519796
[seq]mel2wav
[seq]wav2utmos


 40%|████████████████████████████████████▍                                                       | 40/101 [12:28<23:49, 23.44s/it]

utmos: 3.568474531173706
test_ds_index_40: LJ041-0133
[seq]text2mel
phonemes_len: 1018
phnid_len: 193
gradtfkfultts dt: 18.260501062497497
gradtfkfultts RTF: 2.16047845353317
[seq]mel2wav
[seq]wav2utmos


 41%|█████████████████████████████████████▎                                                      | 41/101 [12:52<23:42, 23.70s/it]

utmos: 3.338887929916382
test_ds_index_41: LJ038-0264
[seq]text2mel
phonemes_len: 1116
phnid_len: 211
gradtfkfultts dt: 17.440547910518944
gradtfkfultts RTF: 2.1834352370261554
[seq]mel2wav
[seq]wav2utmos


 42%|██████████████████████████████████████▎                                                     | 42/101 [13:16<23:13, 23.62s/it]

utmos: 3.8065319061279297
test_ds_index_42: LJ011-0016
[seq]text2mel
phonemes_len: 609
phnid_len: 117
gradtfkfultts dt: 9.799648409709334
gradtfkfultts RTF: 2.192392932569915
[seq]mel2wav
[seq]wav2utmos


 43%|███████████████████████████████████████▏                                                    | 43/101 [13:29<19:42, 20.39s/it]

utmos: 4.085221290588379
test_ds_index_43: LJ003-0185
[seq]text2mel
phonemes_len: 1105
phnid_len: 209
gradtfkfultts dt: 16.790057993493974
gradtfkfultts RTF: 2.145660114269648
[seq]mel2wav
[seq]wav2utmos


 44%|████████████████████████████████████████                                                    | 44/101 [13:51<19:59, 21.04s/it]

utmos: 3.5681138038635254
test_ds_index_44: LJ014-0063
[seq]text2mel
phonemes_len: 793
phnid_len: 151
gradtfkfultts dt: 12.17853984888643
gradtfkfultts RTF: 2.0895854369081936
[seq]mel2wav
[seq]wav2utmos


 45%|████████████████████████████████████████▉                                                   | 45/101 [14:08<18:28, 19.79s/it]

utmos: 3.665069341659546
test_ds_index_45: LJ005-0185
[seq]text2mel
phonemes_len: 588
phnid_len: 113
gradtfkfultts dt: 10.266278700903058
gradtfkfultts RTF: 2.1359020734725283
[seq]mel2wav
[seq]wav2utmos


 46%|█████████████████████████████████████████▉                                                  | 46/101 [14:22<16:32, 18.05s/it]

utmos: 3.346945285797119
test_ds_index_46: LJ014-0135
[seq]text2mel
phonemes_len: 826
phnid_len: 157
gradtfkfultts dt: 13.002026121132076
gradtfkfultts RTF: 2.085476867805533
[seq]mel2wav
[seq]wav2utmos


 47%|██████████████████████████████████████████▊                                                 | 47/101 [14:36<15:13, 16.91s/it]

utmos: 3.020235300064087
test_ds_index_47: LJ009-0046
[seq]text2mel
phonemes_len: 1011
phnid_len: 193
gradtfkfultts dt: 16.756892133504152
gradtfkfultts RTF: 2.0767169039105586
[seq]mel2wav
[seq]wav2utmos


 48%|███████████████████████████████████████████▋                                                | 48/101 [14:59<16:31, 18.71s/it]

utmos: 3.9311671257019043
test_ds_index_48: LJ037-0024
[seq]text2mel
phonemes_len: 1505
phnid_len: 285
gradtfkfultts dt: 26.13024435378611
gradtfkfultts RTF: 2.420076814520261
[seq]mel2wav
[seq]wav2utmos


 49%|████████████████████████████████████████████▋                                               | 49/101 [15:33<20:07, 23.22s/it]

utmos: 3.2915730476379395
test_ds_index_49: LJ002-0217
[seq]text2mel
phonemes_len: 1093
phnid_len: 207
gradtfkfultts dt: 16.659184618853033
gradtfkfultts RTF: 2.221211184486923
[seq]mel2wav
[seq]wav2utmos


 50%|█████████████████████████████████████████████▌                                              | 50/101 [15:55<19:27, 22.89s/it]

utmos: 3.149360179901123
test_ds_index_50: LJ044-0017
[seq]text2mel
phonemes_len: 782
phnid_len: 149
gradtfkfultts dt: 13.483509779907763
gradtfkfultts RTF: 2.0963404687991183
[seq]mel2wav
[seq]wav2utmos


 50%|██████████████████████████████████████████████▍                                             | 51/101 [16:10<17:02, 20.45s/it]

utmos: 2.6667089462280273
test_ds_index_51: LJ017-0074
[seq]text2mel
phonemes_len: 481
phnid_len: 93
gradtfkfultts dt: 8.881651354022324
gradtfkfultts RTF: 2.2368468151063623
[seq]mel2wav
[seq]wav2utmos


 51%|███████████████████████████████████████████████▎                                            | 52/101 [16:22<14:37, 17.90s/it]

utmos: 3.4528586864471436
test_ds_index_52: LJ033-0153
[seq]text2mel
phonemes_len: 714
phnid_len: 137
gradtfkfultts dt: 11.785239203833044
gradtfkfultts RTF: 2.1191979094183733
[seq]mel2wav
[seq]wav2utmos


 52%|████████████████████████████████████████████████▎                                           | 53/101 [16:38<13:52, 17.35s/it]

utmos: 2.720731258392334
test_ds_index_53: LJ032-0124
[seq]text2mel
phonemes_len: 1056
phnid_len: 199
gradtfkfultts dt: 18.2145540677011
gradtfkfultts RTF: 2.24766586000632
[seq]mel2wav
[seq]wav2utmos


 53%|█████████████████████████████████████████████████▏                                          | 54/101 [17:02<15:11, 19.40s/it]

utmos: 3.4279897212982178
test_ds_index_54: LJ018-0287
[seq]text2mel
phonemes_len: 567
phnid_len: 109
gradtfkfultts dt: 9.917067611590028
gradtfkfultts RTF: 2.1846161768258483
[seq]mel2wav
[seq]wav2utmos


 54%|██████████████████████████████████████████████████                                          | 55/101 [17:16<13:32, 17.67s/it]

utmos: 2.846484661102295
test_ds_index_55: LJ020-0038
[seq]text2mel
phonemes_len: 620
phnid_len: 119
gradtfkfultts dt: 10.672114712186158
gradtfkfultts RTF: 2.152738303239395
[seq]mel2wav
[seq]wav2utmos


 55%|███████████████████████████████████████████████████                                         | 56/101 [17:30<12:31, 16.70s/it]

utmos: 3.068068742752075
test_ds_index_56: LJ001-0007
[seq]text2mel
phonemes_len: 1016
phnid_len: 193
gradtfkfultts dt: 19.562993730418384
gradtfkfultts RTF: 2.261766001235976
[seq]mel2wav
[seq]wav2utmos


 56%|███████████████████████████████████████████████████▉                                        | 57/101 [17:56<14:15, 19.44s/it]

utmos: 2.689732551574707
test_ds_index_57: LJ003-0313
[seq]text2mel
phonemes_len: 1175
phnid_len: 223
gradtfkfultts dt: 19.33674688078463
gradtfkfultts RTF: 2.2356085818021243
[seq]mel2wav
[seq]wav2utmos


 57%|████████████████████████████████████████████████████▊                                       | 58/101 [18:17<14:15, 19.88s/it]

utmos: 3.5600366592407227
test_ds_index_58: LJ019-0265
[seq]text2mel
phonemes_len: 590
phnid_len: 113
gradtfkfultts dt: 11.356301086954772
gradtfkfultts RTF: 2.1080822246039257
[seq]mel2wav
[seq]wav2utmos


 58%|█████████████████████████████████████████████████████▋                                      | 59/101 [18:32<12:59, 18.56s/it]

utmos: 3.5137383937835693
test_ds_index_59: LJ038-0281
[seq]text2mel
phonemes_len: 1193
phnid_len: 225
gradtfkfultts dt: 20.55672716908157
gradtfkfultts RTF: 2.308485954195774
[seq]mel2wav
[seq]wav2utmos


 59%|██████████████████████████████████████████████████████▋                                     | 60/101 [18:59<14:23, 21.06s/it]

utmos: 3.5842416286468506
test_ds_index_60: LJ045-0235
[seq]text2mel
phonemes_len: 1185
phnid_len: 223
gradtfkfultts dt: 18.55254962667823
gradtfkfultts RTF: 2.150717736731657
[seq]mel2wav
[seq]wav2utmos


 60%|███████████████████████████████████████████████████████▌                                    | 61/101 [19:24<14:42, 22.06s/it]

utmos: 3.5342090129852295
test_ds_index_61: LJ038-0255
[seq]text2mel
phonemes_len: 240
phnid_len: 47
gradtfkfultts dt: 5.112157329916954
gradtfkfultts RTF: 2.652557161254444
[seq]mel2wav


 61%|████████████████████████████████████████████████████████▍                                   | 62/101 [19:30<11:21, 17.47s/it]

[seq]wav2utmos
utmos: 3.1898610591888428
test_ds_index_62: LJ028-0205
[seq]text2mel
phonemes_len: 899
phnid_len: 171
gradtfkfultts dt: 16.004083139821887
gradtfkfultts RTF: 2.0760191149347738
[seq]mel2wav
[seq]wav2utmos


 62%|█████████████████████████████████████████████████████████▍                                  | 63/101 [19:52<11:51, 18.72s/it]

utmos: 2.7211413383483887
test_ds_index_63: LJ014-0260
[seq]text2mel
phonemes_len: 1068
phnid_len: 203
gradtfkfultts dt: 18.688289468176663
gradtfkfultts RTF: 2.0584078423378327
[seq]mel2wav
[seq]wav2utmos


 63%|██████████████████████████████████████████████████████████▎                                 | 64/101 [20:17<12:46, 20.70s/it]

utmos: 3.187817335128784
test_ds_index_64: LJ033-0166
[seq]text2mel
phonemes_len: 967
phnid_len: 183
gradtfkfultts dt: 15.810818523168564
gradtfkfultts RTF: 2.171978097811172
[seq]mel2wav
[seq]wav2utmos


 64%|███████████████████████████████████████████████████████████▏                                | 65/101 [20:38<12:23, 20.64s/it]

utmos: 3.4041857719421387
test_ds_index_65: LJ037-0125
[seq]text2mel
phonemes_len: 1085
phnid_len: 205
gradtfkfultts dt: 17.42655210569501
gradtfkfultts RTF: 2.044956328394153
[seq]mel2wav
[seq]wav2utmos


 65%|████████████████████████████████████████████████████████████                                | 66/101 [21:01<12:29, 21.42s/it]

utmos: 3.225071430206299
test_ds_index_66: LJ013-0142
[seq]text2mel
phonemes_len: 740
phnid_len: 143
gradtfkfultts dt: 12.359278459101915
gradtfkfultts RTF: 2.0996832626294166
[seq]mel2wav
[seq]wav2utmos


 66%|█████████████████████████████████████████████████████████████                               | 67/101 [21:17<11:14, 19.85s/it]

utmos: 3.173292636871338
test_ds_index_67: LJ031-0199
[seq]text2mel
phonemes_len: 1439
phnid_len: 273
gradtfkfultts dt: 23.411131920292974
gradtfkfultts RTF: 2.3204449207173297
[seq]mel2wav
[seq]wav2utmos


 67%|█████████████████████████████████████████████████████████████▉                              | 68/101 [21:48<12:44, 23.16s/it]

utmos: 3.8213415145874023
test_ds_index_68: LJ004-0017
[seq]text2mel
phonemes_len: 619
phnid_len: 119
gradtfkfultts dt: 10.568554388359189
gradtfkfultts RTF: 2.1071743368717457
[seq]mel2wav
[seq]wav2utmos


 68%|██████████████████████████████████████████████████████████████▊                             | 69/101 [22:03<10:57, 20.55s/it]

utmos: 2.9142839908599854
test_ds_index_69: LJ024-0115
[seq]text2mel
phonemes_len: 1011
phnid_len: 191
gradtfkfultts dt: 16.77158183977008
gradtfkfultts RTF: 2.063690734190459
[seq]mel2wav
[seq]wav2utmos


 69%|███████████████████████████████████████████████████████████████▊                            | 70/101 [22:25<10:56, 21.16s/it]

utmos: 2.9027106761932373
test_ds_index_70: LJ017-0171
[seq]text2mel
phonemes_len: 620
phnid_len: 119
gradtfkfultts dt: 9.266595620661974
gradtfkfultts RTF: 2.1689074540972797
[seq]mel2wav
[seq]wav2utmos


 70%|████████████████████████████████████████████████████████████████▋                           | 71/101 [22:38<09:20, 18.67s/it]

utmos: 3.985438823699951
test_ds_index_71: LJ017-0040
[seq]text2mel
phonemes_len: 412
phnid_len: 79
gradtfkfultts dt: 8.487959979102015
gradtfkfultts RTF: 2.2154298951136298
[seq]mel2wav
[seq]wav2utmos


 71%|█████████████████████████████████████████████████████████████████▌                          | 72/101 [22:50<07:59, 16.55s/it]

utmos: 3.4157493114471436
test_ds_index_72: LJ005-0044
[seq]text2mel
phonemes_len: 796
phnid_len: 151
gradtfkfultts dt: 11.83420137502253
gradtfkfultts RTF: 2.0973519508684317
[seq]mel2wav
[seq]wav2utmos


 72%|██████████████████████████████████████████████████████████████████▍                         | 73/101 [23:06<07:39, 16.41s/it]

utmos: 3.2632060050964355
test_ds_index_73: LJ007-0169
[seq]text2mel
phonemes_len: 399
phnid_len: 77
gradtfkfultts dt: 6.9908887362107635
gradtfkfultts RTF: 2.3338949950558283
[seq]mel2wav


 73%|███████████████████████████████████████████████████████████████████▍                        | 74/101 [23:15<06:27, 14.34s/it]

[seq]wav2utmos
utmos: 4.052618503570557
test_ds_index_74: LJ015-0153
[seq]text2mel
phonemes_len: 1156
phnid_len: 219
gradtfkfultts dt: 18.09679610002786
gradtfkfultts RTF: 2.064540324946266
[seq]mel2wav
[seq]wav2utmos


 74%|████████████████████████████████████████████████████████████████████▎                       | 75/101 [23:35<06:53, 15.90s/it]

utmos: 3.5796775817871094
test_ds_index_75: LJ045-0043
[seq]text2mel
phonemes_len: 1166
phnid_len: 221
gradtfkfultts dt: 20.67538356781006
gradtfkfultts RTF: 2.291929132833674
[seq]mel2wav
[seq]wav2utmos


 75%|█████████████████████████████████████████████████████████████████████▏                      | 76/101 [24:02<08:01, 19.25s/it]

utmos: 3.0026848316192627
test_ds_index_76: LJ050-0010
[seq]text2mel
phonemes_len: 1046
phnid_len: 197
gradtfkfultts dt: 14.996941891498864
gradtfkfultts RTF: 2.090176025912406
[seq]mel2wav
[seq]wav2utmos


 76%|██████████████████████████████████████████████████████████████████████▏                     | 77/101 [24:22<07:51, 19.64s/it]

utmos: 3.5812175273895264
test_ds_index_77: LJ006-0126
[seq]text2mel
phonemes_len: 1061
phnid_len: 201
gradtfkfultts dt: 15.459552195854485
gradtfkfultts RTF: 2.0612611619496866
[seq]mel2wav
[seq]wav2utmos


 77%|███████████████████████████████████████████████████████████████████████                     | 78/101 [24:39<07:12, 18.79s/it]

utmos: 3.6824216842651367
test_ds_index_78: LJ018-0356
[seq]text2mel
phonemes_len: 979
phnid_len: 185
gradtfkfultts dt: 18.945736301131546
gradtfkfultts RTF: 2.3146802163117832
[seq]mel2wav
[seq]wav2utmos


 78%|███████████████████████████████████████████████████████████████████████▉                    | 79/101 [25:04<07:33, 20.60s/it]

utmos: 3.0140280723571777
test_ds_index_79: LJ040-0223
[seq]text2mel
phonemes_len: 1070
phnid_len: 201
gradtfkfultts dt: 17.72401320654899
gradtfkfultts RTF: 2.2483344716748275
[seq]mel2wav
[seq]wav2utmos


 79%|████████████████████████████████████████████████████████████████████████▊                   | 80/101 [25:27<07:28, 21.35s/it]

utmos: 3.1201436519622803
test_ds_index_80: LJ008-0281
[seq]text2mel
phonemes_len: 651
phnid_len: 125
gradtfkfultts dt: 11.264487527310848
gradtfkfultts RTF: 2.091038776074254
[seq]mel2wav
[seq]wav2utmos


 80%|█████████████████████████████████████████████████████████████████████████▊                  | 81/101 [25:40<06:13, 18.69s/it]

utmos: 4.239401817321777
test_ds_index_81: LJ008-0222
[seq]text2mel
phonemes_len: 515
phnid_len: 99
gradtfkfultts dt: 9.319150450639427
gradtfkfultts RTF: 2.217360879624907
[seq]mel2wav
[seq]wav2utmos


 81%|██████████████████████████████████████████████████████████████████████████▋                 | 82/101 [25:52<05:20, 16.89s/it]

utmos: 3.073042869567871
test_ds_index_82: LJ046-0123
[seq]text2mel
phonemes_len: 1130
phnid_len: 213
gradtfkfultts dt: 19.913827188313007
gradtfkfultts RTF: 2.2930935071769607
[seq]mel2wav
[seq]wav2utmos


 82%|███████████████████████████████████████████████████████████████████████████▌                | 83/101 [26:19<05:54, 19.70s/it]

utmos: 3.2047650814056396
test_ds_index_83: LJ030-0044
[seq]text2mel
phonemes_len: 965
phnid_len: 185
gradtfkfultts dt: 17.46389394905418
gradtfkfultts RTF: 2.1863579985955934
[seq]mel2wav
[seq]wav2utmos


 83%|████████████████████████████████████████████████████████████████████████████▌               | 84/101 [26:38<05:31, 19.53s/it]

utmos: 3.601828098297119
test_ds_index_84: LJ018-0051
[seq]text2mel
phonemes_len: 820
phnid_len: 157
gradtfkfultts dt: 13.974081612192094
gradtfkfultts RTF: 2.130313188252459
[seq]mel2wav
[seq]wav2utmos


 84%|█████████████████████████████████████████████████████████████████████████████▍              | 85/101 [26:57<05:09, 19.34s/it]

utmos: 2.849276542663574
test_ds_index_85: LJ042-0231
[seq]text2mel
phonemes_len: 574
phnid_len: 111
gradtfkfultts dt: 9.911272810772061
gradtfkfultts RTF: 2.205906466787023
[seq]mel2wav
[seq]wav2utmos


 85%|██████████████████████████████████████████████████████████████████████████████▎             | 86/101 [27:10<04:23, 17.57s/it]

utmos: 3.962049961090088
test_ds_index_86: LJ011-0121
[seq]text2mel
phonemes_len: 1123
phnid_len: 213
gradtfkfultts dt: 18.841360627673566
gradtfkfultts RTF: 2.190093633182577
[seq]mel2wav
[seq]wav2utmos


 86%|███████████████████████████████████████████████████████████████████████████████▏            | 87/101 [27:31<04:18, 18.45s/it]

utmos: 3.0234718322753906
test_ds_index_87: LJ016-0186
[seq]text2mel
phonemes_len: 848
phnid_len: 161
gradtfkfultts dt: 12.898097599856555
gradtfkfultts RTF: 2.1080634196871815
[seq]mel2wav
[seq]wav2utmos


 87%|████████████████████████████████████████████████████████████████████████████████▏           | 88/101 [27:48<03:55, 18.09s/it]

utmos: 3.370422601699829
test_ds_index_88: LJ011-0164
[seq]text2mel
phonemes_len: 1007
phnid_len: 191
gradtfkfultts dt: 19.330244716256857
gradtfkfultts RTF: 2.240872602590131
[seq]mel2wav
[seq]wav2utmos


 88%|█████████████████████████████████████████████████████████████████████████████████           | 89/101 [28:09<03:46, 18.89s/it]

utmos: 4.130599021911621
test_ds_index_89: LJ026-0039
[seq]text2mel
phonemes_len: 1150
phnid_len: 219
gradtfkfultts dt: 18.732779383659363
gradtfkfultts RTF: 2.291913315705394
[seq]mel2wav
[seq]wav2utmos


 89%|█████████████████████████████████████████████████████████████████████████████████▉          | 90/101 [28:33<03:47, 20.68s/it]

utmos: 2.8122923374176025
test_ds_index_90: LJ003-0105
[seq]text2mel
phonemes_len: 971
phnid_len: 185
gradtfkfultts dt: 16.098446179181337
gradtfkfultts RTF: 2.0945686499890748
[seq]mel2wav
[seq]wav2utmos


 90%|██████████████████████████████████████████████████████████████████████████████████▉         | 91/101 [28:55<03:30, 21.03s/it]

utmos: 3.964573621749878
test_ds_index_91: LJ039-0104
[seq]text2mel
phonemes_len: 383
phnid_len: 75
gradtfkfultts dt: 8.512548158876598
gradtfkfultts RTF: 2.2422315427086796
[seq]mel2wav
[seq]wav2utmos


 91%|███████████████████████████████████████████████████████████████████████████████████▊        | 92/101 [29:06<02:42, 18.10s/it]

utmos: 3.9262304306030273
test_ds_index_92: LJ002-0038
[seq]text2mel
phonemes_len: 642
phnid_len: 123
gradtfkfultts dt: 13.041138895787299
gradtfkfultts RTF: 2.115386009976091
[seq]mel2wav
[seq]wav2utmos


 92%|████████████████████████████████████████████████████████████████████████████████████▋       | 93/101 [29:24<02:23, 17.93s/it]

utmos: 3.6702463626861572
test_ds_index_93: LJ046-0194
[seq]text2mel
phonemes_len: 1259
phnid_len: 239
gradtfkfultts dt: 20.714876702055335
gradtfkfultts RTF: 2.2330795881586365
[seq]mel2wav
[seq]wav2utmos


 93%|█████████████████████████████████████████████████████████████████████████████████████▌      | 94/101 [29:51<02:23, 20.53s/it]

utmos: 3.764674663543701
test_ds_index_94: LJ008-0115
[seq]text2mel
phonemes_len: 845
phnid_len: 161
gradtfkfultts dt: 15.251690426841378
gradtfkfultts RTF: 2.1967742338514604
[seq]mel2wav
[seq]wav2utmos


 94%|██████████████████████████████████████████████████████████████████████████████████████▌     | 95/101 [30:11<02:02, 20.48s/it]

utmos: 3.5172877311706543
test_ds_index_95: LJ016-0104
[seq]text2mel
phonemes_len: 521
phnid_len: 101
gradtfkfultts dt: 8.87633551005274
gradtfkfultts RTF: 2.2032960869581113
[seq]mel2wav
[seq]wav2utmos


 95%|███████████████████████████████████████████████████████████████████████████████████████▍    | 96/101 [30:23<01:30, 18.04s/it]

utmos: 3.053208351135254
test_ds_index_96: LJ019-0301
[seq]text2mel
phonemes_len: 1122
phnid_len: 213
gradtfkfultts dt: 17.293665893375874
gradtfkfultts RTF: 2.086207397523514
[seq]mel2wav
[seq]wav2utmos


 96%|████████████████████████████████████████████████████████████████████████████████████████▎   | 97/101 [30:47<01:18, 19.60s/it]

utmos: 3.3363757133483887
test_ds_index_97: LJ028-0012
[seq]text2mel
phonemes_len: 1165
phnid_len: 219
gradtfkfultts dt: 21.906518695876002
gradtfkfultts RTF: 2.367465580125009
[seq]mel2wav
[seq]wav2utmos


 97%|█████████████████████████████████████████████████████████████████████████████████████████▎  | 98/101 [31:15<01:07, 22.34s/it]

utmos: 2.9106523990631104
test_ds_index_98: LJ018-0059
[seq]text2mel
phonemes_len: 1004
phnid_len: 191
gradtfkfultts dt: 17.267871035262942
gradtfkfultts RTF: 2.13390286679266
[seq]mel2wav
[seq]wav2utmos


 98%|██████████████████████████████████████████████████████████████████████████████████████████▏ | 99/101 [31:38<00:45, 22.56s/it]

utmos: 3.257174491882324
test_ds_index_99: LJ029-0081
[seq]text2mel
phonemes_len: 1427
phnid_len: 269
gradtfkfultts dt: 27.369109156541526
gradtfkfultts RTF: 2.39814684361386
[seq]mel2wav
[seq]wav2utmos


 99%|██████████████████████████████████████████████████████████████████████████████████████████ | 100/101 [32:14<00:26, 26.54s/it]

utmos: 3.9679741859436035
test_ds_index_100: LJ017-0230
[seq]text2mel
phonemes_len: 1085
phnid_len: 207
gradtfkfultts dt: 17.960323691368103
gradtfkfultts RTF: 2.184990385519656
[seq]mel2wav
[seq]wav2utmos


100%|███████████████████████████████████████████████████████████████████████████████████████████| 101/101 [32:38<00:00, 19.39s/it]

utmos: 3.4414823055267334





In [22]:
#RESULT_JSON_PATH = RESULT_DIR_PATH / 'eval4mid.json'
if RESULT_JSON_PATH.exists() == False:
    with open(RESULT_JSON_PATH, 'w') as f:
        for entry in eval_list:
            f.write(json.dumps(entry) + '\n')
    print(f'Make {RESULT_JSON_PATH}')
else:
    print(f'Already Exists {RESULT_JSON_PATH}')

Make result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/eval4mid.json


In [23]:
eval_list

[{'name': 'LJ045-0049',
  'phonemes_len': 1127,
  'phnid_len': 215,
  'dt': 22.235576699487865,
  'RTF4mel': 2.919531644338959,
  'utmos': 3.0999863147735596},
 {'name': 'LJ017-0027',
  'phonemes_len': 411,
  'phnid_len': 79,
  'dt': 7.859424853697419,
  'RTF4mel': 2.5739709782561206,
  'utmos': 3.902704954147339},
 {'name': 'LJ023-0031',
  'phonemes_len': 223,
  'phnid_len': 45,
  'dt': 4.456846786662936,
  'RTF4mel': 2.7033855536399027,
  'utmos': 3.1458921432495117},
 {'name': 'LJ032-0046',
  'phonemes_len': 650,
  'phnid_len': 125,
  'dt': 11.059734380804002,
  'RTF4mel': 2.075394395907614,
  'utmos': 3.5897765159606934},
 {'name': 'LJ030-0026',
  'phonemes_len': 1025,
  'phnid_len': 195,
  'dt': 15.682630959898233,
  'RTF4mel': 2.225352738015831,
  'utmos': 2.734996795654297},
 {'name': 'LJ020-0041',
  'phonemes_len': 1154,
  'phnid_len': 219,
  'dt': 20.763847924768925,
  'RTF4mel': 2.334789321256705,
  'utmos': 3.271646022796631},
 {'name': 'LJ001-0070',
  'phonemes_len': 1137,


## recheck eval_json

In [60]:
target_model = 'gradtfk5tts'
target_device = 'cpu'
target_epoch_str = 'e500'

print(target_model)
print(RESULT_DIR_PATH)

# /result4eval/infer4mid/gradtts/cpu/e500/eval4mid.json
# result4eval/<runtime_name>/<model_name>/<device_name>/<target_epoch>/eval.json
# or
# result4eval/json4mid/<target_epoch>/eval4mid_<model_shortword>_<kernel_size>.json
"""
eval_info = {
    'gradtts_cpu': 'result4eval/infer4mid/gradtts/cpu/eval4mid.json',
    'gradseptts_cpu': 'result4eval/infer4mid/gradseptts/cpu/eval4mid.json',
    'gradtfktts_cpu': 'result4eval/infer4mid/gradtfktts/cpu/eval4mid.json',
    'gradtfk5tts_cpu': 'result4eval/infer4mid/gradtfk5tts/cpu/eval4mid.json',
    'gradtimektts_cpu': 'result4eval/infer4mid/gradtimektts/cpu/eval4mid.json',
    'gradfreqktts_cpu': 'result4eval/infer4mid/gradfreqktts/cpu/eval4mid.json',
}
"""
"""
eval_info = {
    'cpu': {
        'e500': {
            'gradtts':f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradseptts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfktts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfk5tts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtimektts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradfreqktts': f'{RESULT_DIR_PATH}/eval4mid.json',
        },
        'e1000':{
            'gradtts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradseptts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfktts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtfk5tts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradtimektts': f'{RESULT_DIR_PATH}/eval4mid.json',
            'gradfreqktts': f'{RESULT_DIR_PATH}/eval4mid.json',
        }
    }
}
"""

eval_jsonl_path = RESULT_DIR_PATH / 'eval4mid.json'
print(f'eval_jsonl_path: {eval_jsonl_path}')

if eval_jsonl_path.exists() == True:
    print(f'Exist {eval_jsonl_path}')
    import json
    with open(eval_jsonl_path) as f:
        eval_jsonl_list = [json.loads(l) for l in f]
else:
    print(f'No Exists {eval_jsonl_path}')

gradtfk5tts
result4eval/run_tfk_k5/gradtfk5tts/cpu/e500
eval_jsonl_path: result4eval/run_tfk_k5/gradtfk5tts/cpu/e500/eval4mid.json
Exist result4eval/run_tfk_k5/gradtfk5tts/cpu/e500/eval4mid.json


In [61]:
eval_jsonl_list

[{'name': 'LJ045-0049',
  'phonemes_len': 1127,
  'phnid_len': 215,
  'dt': 18.630400406196713,
  'RTF4mel': 2.1684983580903587,
  'utmos': 3.7940917015075684},
 {'name': 'LJ017-0027',
  'phonemes_len': 411,
  'phnid_len': 79,
  'dt': 5.441780355758965,
  'RTF4mel': 1.8526318065168783,
  'utmos': 4.25128173828125},
 {'name': 'LJ023-0031',
  'phonemes_len': 223,
  'phnid_len': 45,
  'dt': 4.267865874804556,
  'RTF4mel': 1.9657930009074296,
  'utmos': 4.103524684906006},
 {'name': 'LJ032-0046',
  'phonemes_len': 650,
  'phnid_len': 125,
  'dt': 8.511940375901759,
  'RTF4mel': 1.6549827639023154,
  'utmos': 3.930527687072754},
 {'name': 'LJ030-0026',
  'phonemes_len': 1025,
  'phnid_len': 195,
  'dt': 12.719644932076335,
  'RTF4mel': 1.8824377869434814,
  'utmos': 3.6048166751861572},
 {'name': 'LJ020-0041',
  'phonemes_len': 1154,
  'phnid_len': 219,
  'dt': 15.45363965909928,
  'RTF4mel': 1.7491004562414747,
  'utmos': 3.749933958053589},
 {'name': 'LJ001-0070',
  'phonemes_len': 1137,


In [7]:
#'result4eval/run_tfkful_k3/gradtfkfultts/cpu/e500/eval4mid.json'
#result4eval/run_tfkful_k3/gradtfkfultts_e500/cpu/e500/eval4mid.json
#'result4eval/run_tfk_k5/gradtfk5tts/cpu/e500/eval4mid.json
#toybox.load_json('result4eval/infer4colb/gradtfk5tts/cpu/e500_n50/eval4midb.json')
a = toybox.load_json('result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/eval4mid.json')
#a = eval_list

Exist result4eval/run_tfkfulmask_k3/gradtfkfultts/cpu/e500/eval4mid.json


In [8]:
dt_list = [a[n]['dt'] for n in range(len(a))]
RTF4mel_list = [a[n]['RTF4mel'] for n in range(len(a))]
utmos_list = [a[n]['utmos'] for n in range(len(a))]

dt_nparr = np.array(dt_list[1:101])
RTF4mel_nparr = np.array(RTF4mel_list[1:101])
utmos_nparr = np.array(utmos_list[1:101])
print(len(dt_nparr))

significant_digits = 10

# for culc difference time to infer text2mel
dt_mean = toybox.round_significant_digits(np.mean(dt_nparr), significant_digits=significant_digits)
dt_var = toybox.round_significant_digits(np.var(dt_nparr), significant_digits=significant_digits)
dt_std = toybox.round_significant_digits(np.std(dt_nparr), significant_digits=significant_digits)
print(f'dt ---------------------------')
print(f'dt mean: {dt_mean}')
print(f'dt var: {dt_var}')
print(f'dt std: {dt_std}')

# for culc RTF4mel to infer text2mel
RTF4mel_mean = toybox.round_significant_digits(np.mean(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_var = toybox.round_significant_digits(np.var(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_std = toybox.round_significant_digits(np.std(RTF4mel_nparr), significant_digits=significant_digits)
print(f'RTF ---------------------------')
print(f'RTF mean: {RTF4mel_mean}')
print(f'RTF var: {RTF4mel_var}')
print(f'RTF std: {RTF4mel_std}')

# for culc utmos to infer 
print(f'utmos ---------------------------')
utmos_mean = toybox.round_significant_digits(np.mean(utmos_nparr), significant_digits=significant_digits)
utmos_var = toybox.round_significant_digits(np.var(utmos_nparr), significant_digits=significant_digits)
utmos_std = toybox.round_significant_digits(np.std(utmos_nparr), significant_digits=significant_digits)
print(f'utmos mean: {utmos_mean}')
print(f'utmos var: {utmos_var}')
print(f'utmos std: {utmos_std}')

100
dt ---------------------------
dt mean: 14.75482539
dt var: 29.438431780000002
dt std: 5.425719471
RTF ---------------------------
RTF mean: 2.221159653
RTF var: 0.02777350155
RTF std: 0.1666538375
utmos ---------------------------
utmos mean: 3.39501344
utmos var: 0.1545336726
utmos std: 0.3931077112


## [seq] analysis json

It is sequence of <span style="color: red;">json analysis section</span> from below cells

The 0th data seemed to have a slower inference time than usual, perhaps because the model took longer to load. Therefore, we try to use data from 1 to 101.

In [23]:
# for loading eval_jsonl of each models
target_model = 'gradtfkfultts'
target_device = 'cpu'
target_epoch_str = 'e500'
eval_json_dir = 'result'#'result4eval/json4mid' # or result4eval/infer4mid

eval_json_dirpath = f'{eval_json_dir}/{target_epoch_str}'
print(f'target_model: {target_model}')
print(f'target_jsonl_path: {eval_json_dirpath}')


# original eval4mid.json path: 'gradtts_cpu': 'result4eval/infer4mid/gradtts/cpu/eval4mid.json'
# diff result4eval/infer4mid/gradtts/cpu/eval4mid.json result4eval/json4mid/eval4mid_gt_k3.json
# no difference between upper files

"""midtermstyle
eval_info = {
    'lj': f'result4eval/infer4PBL/groundtruth/evalljPBL.json',
    'lj_via_hifigan': f'result4eval/infer4PBL/groundtruth/eval4PBL.json',
    'gradtts_cpu': f'{eval_json_dirpath}/eval4mid_gt_k3.json',
    'gradseptts_cpu': f'{eval_json_dirpath}/eval4mid_sgt_k3.json',
    'gradtfktts_cpu': f'{eval_json_dirpath}/eval4mid_tfk_k3.json',
    'gradtfk5tts_cpu': f'{eval_json_dirpath}/eval4mid_tfk_k5.json',
    'gradtimektts_cpu': f'{eval_json_dirpath}/eval4mid_timek_k3.json',
    'gradfreqktts_cpu': f'{eval_json_dirpath}/eval4mid_freqk_k3.json',
}

"""

# result4eval/<runtime_name>/<model_name>/<device_name>/<target_epoch>/eval.json
# or
# result4eval/json4mid/<target_epoch>/eval4mid_<model_shortword>_<kernel_size>.json

eval_info = {
    'lj': f'result4eval/infer4PBL/groundtruth/evalljPBL.json',
    'lj_via_hifigan': f'result4eval/infer4PBL/groundtruth/eval4PBL.json',
    'cpu': {
        'e500': {
            'gradtts':f'{eval_json_dirpath}/eval4mid_gt_k3.json',
            'gradseptts': f'{eval_json_dirpath}/eval4mid_sgt_k3.json',
            'gradtfktts': f'{eval_json_dirpath}/eval4mid_tfk_k3.json',
            'gradtfk5tts': f'{eval_json_dirpath}/eval4mid_tfk_k5.json',
            'gradtimektts': f'{eval_json_dirpath}/eval4mid_timek_k3.json',
            'gradfreqktts': f'{eval_json_dirpath}/eval4mid_freqk_k3.json',
        },
        'e1000':{
            
        }
    }
}

#eval_jsonl_path = Path(eval_info[eval_target]) # midtermstyle
eval_jsonl_path = Path(eval_info[target_device][target_epoch_str][target_model])


if eval_jsonl_path.exists() == True:
    print(f'Exist: {eval_jsonl_path}')
    import json
    with open(eval_jsonl_path) as f:
        eval_jsonl_list = [json.loads(l) for l in f]
else:
    print(f'No Exists: {eval_jsonl_path}')


# for loading test_dataset.json
test_dataset_json_path = Path('configs/test_dataset.json')
import json
with open(test_dataset_json_path) as f:
    test_dataset_list = [json.loads(l) for l in f]
print(f'load: {test_dataset_json_path}')

target_model: gradtts
target_jsonl_path: result4eval/json4mid/e500
Exist: result4eval/json4mid/e500/eval4mid_gt_k3.json
load: configs/test_dataset.json


In [24]:
def round_significant_digits(value, significant_digits=5):
    if value == 0:
        return 0

    import math
    scale = math.floor(-math.log10(abs(value)))  # Find the first nonzero after the decimal point
    factor = 10 ** (scale + significant_digits - 1)  # Scale to hold 5 significant digits

    rounded_value = round(value * factor,1) / factor  # Adjust and round off the scale
    return rounded_value

#input_value = 0.06238165879957992
#input_value = 0.007710418871435095

# Rounded to the nearest whole number to ensure significant figures
#result = round_significant_digits(input_value, significant_digits=5)
#print(result) 

In [25]:
# for culc phonemes length
#text = test_ds_list[i]['text']
#phonemes, phnid = text2phnid(text, phn2id, 'en')
#phonemes_len_int = len(phonemes)
#phnid_len_int = len(phnid)

#test_dataset_list[0][0]

In [26]:
# The 0th data seemed to have a slower inference time than usual, 
# perhaps because the model took longer to load.
# Therefore, we try to use data from 1 to 101

#eval_jsonl_list[0]

In [27]:
#counter_list = list(range(1,101))
#print(f'counter_list_len: {len(counter_list)}')
#print(f'counter_width: {counter_list[0]}-{counter_list[-1]}')
#print(len(dt_nparr))

In [28]:
# culc for evaluation Indicates
dt_list = [eval_jsonl_list[n]['dt'] for n in range(len(eval_jsonl_list))]
RTF4mel_list = [eval_jsonl_list[n]['RTF4mel'] for n in range(len(eval_jsonl_list))]
utmos_list = [eval_jsonl_list[n]['utmos'] for n in range(len(eval_jsonl_list))]

In [29]:
dt_nparr = np.array(dt_list[1:101])
RTF4mel_nparr = np.array(RTF4mel_list[1:101])
utmos_nparr = np.array(utmos_list[1:101])
print(len(dt_nparr))

100


In [30]:
significant_digits = 5

# for culc difference time to infer text2mel
dt_mean = round_significant_digits(np.mean(dt_nparr), significant_digits=significant_digits)
dt_var = round_significant_digits(np.var(dt_nparr), significant_digits=significant_digits)
dt_std = round_significant_digits(np.std(dt_nparr), significant_digits=significant_digits)
print(f'{target_model}: dt ---------------------------')
print(f'dt mean: {dt_mean}')
print(f'dt var: {dt_var}')
print(f'dt std: {dt_std}')

# for culc RTF4mel to infer text2mel
RTF4mel_mean = round_significant_digits(np.mean(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_var = round_significant_digits(np.var(RTF4mel_nparr), significant_digits=significant_digits)
RTF4mel_std = round_significant_digits(np.std(RTF4mel_nparr), significant_digits=significant_digits)
print(f'{target_model}: RTF ---------------------------')
print(f'RTF mean: {RTF4mel_mean}')
print(f'RTF var: {RTF4mel_var}')
print(f'RTF std: {RTF4mel_std}')

# for culc utmos to infer 
print(f'{target_model}: utmos ---------------------------')
utmos_mean = round_significant_digits(np.mean(utmos_nparr), significant_digits=significant_digits)
utmos_var = round_significant_digits(np.var(utmos_nparr), significant_digits=significant_digits)
utmos_std = round_significant_digits(np.std(utmos_nparr), significant_digits=significant_digits)
print(f'utmos mean: {utmos_mean}')
print(f'utmos var: {utmos_var}')
print(f'utmos std: {utmos_std}')

gradfreqktts: dt ---------------------------
dt mean: 14.776
dt var: 24.908
dt std: 4.9908
gradfreqktts: RTF ---------------------------
RTF mean: 2.2504
RTF var: 0.0038915
RTF std: 0.062382
gradfreqktts: utmos ---------------------------
utmos mean: 4.1955
utmos var: 0.065222
utmos std: 0.25539
