## [seq] import 

In [1]:
import json
import yaml
import sys
import time
import IPython.display as ipd
import pprint
from pathlib import Path
from tqdm import tqdm

import numpy as np
import torch
import torchaudio
from librosa.filters import mel as librosa_mel_fn
#import matplotlib
#matplotlib.use("Agg")
import matplotlib.pyplot as plt
from scipy.io.wavfile import write


import toybox

## [seq] check the configuration

## [seq] device setting

In [2]:
import os

print(f"all cpu at using device: {os.cpu_count()}")
print(f"Number of available CPU: {len(os.sched_getaffinity(0))}") # Number of available CPUs can also be obtained. ,use systemcall at linux.

all cpu at using device: 52
Number of available CPU: 4


In [3]:
DEVICE = 'cpu' # 'cuda' or 'cpu'
DEVICE_UTMOS = 'cuda'
if torch.cuda.is_available():
    print('use cuda')
else:
    os._exit(os.EX_OK)
    print('use cpu')

device = torch.device(DEVICE_UTMOS)
print(f'device: {device}')

use cuda
device: cuda


## [seq] load utmos model

In [4]:
# seed val from https://github.com/sarulab-speech/UTMOS22/blob/master/strong/configs/train/default.yaml
utmos_seed = 1234
toybox.set_seed(utmos_seed)

In [5]:
predictor_utmos = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)

Using cache found in /work/sora-sa/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0


## [seq] setting path¶

In [6]:
test_ds_path = Path('configs/test_dataset.json')

LJWAV_DIR_PATH = Path('./data/ljspeech/LJSpeech-1.1/wavs/')
RESULT_DIR_PATH = Path('./result4eval/ljwav')
RESULT_JSON_PATH = RESULT_DIR_PATH / 'evallj1031.json'

In [7]:
print('-------------------------------------------')
if test_ds_path.exists():
    print(f'Exists {str(test_ds_path)}')
    with open(test_ds_path) as j:
        test_ds_list = json.load(j)
    print(f'loaded {test_ds_path}')
else:
    print(f'No exist {test_ds_path}')

print('-------------------------------------------')
if LJWAV_DIR_PATH.exists():
    print(f'Exists {LJWAV_DIR_PATH}')
else:
    print(f'No exist {LJWAV_DIR_PATH}')

print('-------------------------------------------')
if RESULT_DIR_PATH.exists():
    print(f'Exists {RESULT_DIR_PATH}')
else:
    RESULT_DIR_PATH.mkdir(parents=True)
    #print(f'No exist {RESULT_DIR_PATH}')

print('-------------------------------------------')
if RESULT_JSON_PATH.exists():
    print(f'Exists {RESULT_JSON_PATH}')
else:
    #RESULT_DIR_PATH.mkdir(parents=True)
    print(f'No exist {RESULT_JSON_PATH}')

-------------------------------------------
Exists configs/test_dataset.json
loaded configs/test_dataset.json
-------------------------------------------
Exists data/ljspeech/LJSpeech-1.1/wavs
-------------------------------------------
-------------------------------------------
No exist result4eval/ljwav/evallj1031.json


## [seq] test load wav

In [8]:
ljwav_path = LJWAV_DIR_PATH / f"{test_ds_list[0]['name']}.wav"
#iwav_path = IWAV_DIR_PATH / f"{test_ds_list[0]['name']}.wav"
print(ljwav_path)
ljwav, ljsamplerate = torchaudio.load(ljwav_path)
ipd.display(ipd.Audio(ljwav, rate=ljsamplerate))

data/ljspeech/LJSpeech-1.1/wavs/LJ045-0049.wav


## [seq] test eval

In [9]:
# wav is [1, time]
score = predictor_utmos(ljwav, ljsamplerate)

In [10]:
print(score)
print(type(score))
print(score.item())
print(type(score.item()))

tensor([4.4567], grad_fn=<AddBackward0>)
<class 'torch.Tensor'>
4.45673942565918
<class 'float'>


## [seq] save LJ eval_json

In [11]:
infer_data_num: int = 100 #len(test_ds_list) is 200
print(infer_data_num)

100


In [12]:
print(RESULT_JSON_PATH)

result4eval/ljwav/evallj1031.json


In [13]:
score4lj_utmos_list = []
for i in tqdm(range(infer_data_num)):
    ljfilename = test_ds_list[i]['name']
    ljwav_path = Path('./data/ljspeech/LJSpeech-1.1/wavs') / f"{ljfilename}.wav"
    print(ljwav_path)
    ljwav, ljsamplerate = torchaudio.load(ljwav_path)
    score4lj_utmos = predictor_utmos(ljwav, ljsamplerate)
    score_float = score4lj_utmos.item()
    evallj_dict = {'name': ljfilename, 'path': str(ljwav_path), 'utmos': score_float}
    score4lj_utmos_list.append(evallj_dict)
        

  0%|                                                                                           | 0/100 [00:00<?, ?it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ045-0049.wav


  2%|█▋                                                                                 | 2/100 [00:00<00:28,  3.48it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ017-0027.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ023-0031.wav


  3%|██▍                                                                                | 3/100 [00:00<00:21,  4.47it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ032-0046.wav


  4%|███▎                                                                               | 4/100 [00:01<00:23,  4.06it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ030-0026.wav


  5%|████▏                                                                              | 5/100 [00:01<00:31,  3.06it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ020-0041.wav


  6%|████▉                                                                              | 6/100 [00:01<00:34,  2.70it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ001-0070.wav


  7%|█████▊                                                                             | 7/100 [00:02<00:36,  2.57it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ019-0334.wav


  9%|███████▍                                                                           | 9/100 [00:02<00:26,  3.39it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ022-0152.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ050-0154.wav


 10%|████████▏                                                                         | 10/100 [00:03<00:26,  3.41it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ016-0045.wav


 11%|█████████                                                                         | 11/100 [00:03<00:31,  2.84it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ036-0100.wav


 12%|█████████▊                                                                        | 12/100 [00:03<00:29,  2.98it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ046-0016.wav


 13%|██████████▋                                                                       | 13/100 [00:04<00:27,  3.11it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ048-0085.wav


 15%|████████████▎                                                                     | 15/100 [00:04<00:24,  3.41it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ050-0197.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ050-0178.wav


 16%|█████████████                                                                     | 16/100 [00:05<00:23,  3.57it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ043-0079.wav


 17%|█████████████▉                                                                    | 17/100 [00:05<00:29,  2.79it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ050-0207.wav


 18%|██████████████▊                                                                   | 18/100 [00:05<00:27,  2.99it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ034-0005.wav


 19%|███████████████▌                                                                  | 19/100 [00:06<00:30,  2.67it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ031-0151.wav


 20%|████████████████▍                                                                 | 20/100 [00:06<00:30,  2.65it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ023-0021.wav


 21%|█████████████████▏                                                                | 21/100 [00:06<00:27,  2.83it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ015-0301.wav


 23%|██████████████████▊                                                               | 23/100 [00:07<00:22,  3.35it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ021-0153.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ014-0037.wav


 24%|███████████████████▋                                                              | 24/100 [00:07<00:20,  3.69it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ004-0200.wav


 26%|█████████████████████▎                                                            | 26/100 [00:08<00:21,  3.40it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ049-0010.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ008-0291.wav


 27%|██████████████████████▏                                                           | 27/100 [00:08<00:18,  3.95it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ048-0221.wav


 28%|██████████████████████▉                                                           | 28/100 [00:08<00:17,  4.06it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ004-0157.wav


 29%|███████████████████████▊                                                          | 29/100 [00:09<00:17,  4.07it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ013-0175.wav


 30%|████████████████████████▌                                                         | 30/100 [00:09<00:18,  3.70it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ021-0100.wav


 31%|█████████████████████████▍                                                        | 31/100 [00:09<00:19,  3.58it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ018-0132.wav


 32%|██████████████████████████▏                                                       | 32/100 [00:09<00:19,  3.41it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ023-0059.wav


 33%|███████████████████████████                                                       | 33/100 [00:10<00:24,  2.75it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ003-0027.wav


 34%|███████████████████████████▉                                                      | 34/100 [00:10<00:24,  2.72it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ018-0133.wav


 35%|████████████████████████████▋                                                     | 35/100 [00:11<00:22,  2.94it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ033-0060.wav


 36%|█████████████████████████████▌                                                    | 36/100 [00:11<00:21,  3.04it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ003-0299.wav


 37%|██████████████████████████████▎                                                   | 37/100 [00:11<00:20,  3.00it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ011-0060.wav


 38%|███████████████████████████████▏                                                  | 38/100 [00:12<00:21,  2.83it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ013-0240.wav


 39%|███████████████████████████████▉                                                  | 39/100 [00:12<00:20,  2.92it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ047-0076.wav


 40%|████████████████████████████████▊                                                 | 40/100 [00:13<00:24,  2.46it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ041-0133.wav


 41%|█████████████████████████████████▌                                                | 41/100 [00:13<00:24,  2.36it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ038-0264.wav


 42%|██████████████████████████████████▍                                               | 42/100 [00:14<00:25,  2.29it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ011-0016.wav


 43%|███████████████████████████████████▎                                              | 43/100 [00:14<00:21,  2.70it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ003-0185.wav


 44%|████████████████████████████████████                                              | 44/100 [00:14<00:22,  2.51it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ014-0063.wav


 45%|████████████████████████████████████▉                                             | 45/100 [00:15<00:20,  2.71it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ005-0185.wav


 46%|█████████████████████████████████████▋                                            | 46/100 [00:15<00:17,  3.01it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ014-0135.wav


 47%|██████████████████████████████████████▌                                           | 47/100 [00:15<00:16,  3.15it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ009-0046.wav


 48%|███████████████████████████████████████▎                                          | 48/100 [00:15<00:18,  2.83it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ037-0024.wav


 49%|████████████████████████████████████████▏                                         | 49/100 [00:16<00:20,  2.45it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ002-0217.wav


 50%|█████████████████████████████████████████                                         | 50/100 [00:16<00:20,  2.48it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ044-0017.wav


 51%|█████████████████████████████████████████▊                                        | 51/100 [00:17<00:18,  2.62it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ017-0074.wav


 52%|██████████████████████████████████████████▋                                       | 52/100 [00:17<00:16,  2.92it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ033-0153.wav


 53%|███████████████████████████████████████████▍                                      | 53/100 [00:17<00:15,  3.08it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ032-0124.wav


 54%|████████████████████████████████████████████▎                                     | 54/100 [00:18<00:16,  2.77it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ018-0287.wav


 55%|█████████████████████████████████████████████                                     | 55/100 [00:18<00:14,  3.03it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ020-0038.wav


 56%|█████████████████████████████████████████████▉                                    | 56/100 [00:18<00:13,  3.29it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ001-0007.wav


 57%|██████████████████████████████████████████████▋                                   | 57/100 [00:19<00:15,  2.86it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ003-0313.wav


 58%|███████████████████████████████████████████████▌                                  | 58/100 [00:19<00:16,  2.60it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ019-0265.wav


 59%|████████████████████████████████████████████████▍                                 | 59/100 [00:19<00:14,  2.84it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ038-0281.wav


 60%|█████████████████████████████████████████████████▏                                | 60/100 [00:20<00:14,  2.67it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ045-0235.wav


 62%|██████████████████████████████████████████████████▊                               | 62/100 [00:20<00:11,  3.17it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ038-0255.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ028-0205.wav


 63%|███████████████████████████████████████████████████▋                              | 63/100 [00:21<00:12,  2.93it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ014-0260.wav


 64%|████████████████████████████████████████████████████▍                             | 64/100 [00:21<00:13,  2.70it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ033-0166.wav


 65%|█████████████████████████████████████████████████████▎                            | 65/100 [00:22<00:12,  2.71it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ037-0125.wav


 66%|██████████████████████████████████████████████████████                            | 66/100 [00:22<00:12,  2.66it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ013-0142.wav


 67%|██████████████████████████████████████████████████████▉                           | 67/100 [00:22<00:11,  2.81it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ031-0199.wav


 68%|███████████████████████████████████████████████████████▊                          | 68/100 [00:23<00:12,  2.52it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ004-0017.wav


 69%|████████████████████████████████████████████████████████▌                         | 69/100 [00:23<00:11,  2.76it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ024-0115.wav


 70%|█████████████████████████████████████████████████████████▍                        | 70/100 [00:24<00:12,  2.43it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ017-0171.wav


 72%|███████████████████████████████████████████████████████████                       | 72/100 [00:24<00:08,  3.22it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ017-0040.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ005-0044.wav


 74%|████████████████████████████████████████████████████████████▋                     | 74/100 [00:24<00:06,  3.97it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ007-0169.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ015-0153.wav


 75%|█████████████████████████████████████████████████████████████▌                    | 75/100 [00:25<00:07,  3.19it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ045-0043.wav


 76%|██████████████████████████████████████████████████████████████▎                   | 76/100 [00:25<00:08,  2.72it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ050-0010.wav


 77%|███████████████████████████████████████████████████████████████▏                  | 77/100 [00:26<00:08,  2.73it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ006-0126.wav


 78%|███████████████████████████████████████████████████████████████▉                  | 78/100 [00:26<00:07,  2.76it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ018-0356.wav


 79%|████████████████████████████████████████████████████████████████▊                 | 79/100 [00:27<00:07,  2.64it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ040-0223.wav


 80%|█████████████████████████████████████████████████████████████████▌                | 80/100 [00:27<00:07,  2.57it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ008-0281.wav


 81%|██████████████████████████████████████████████████████████████████▍               | 81/100 [00:27<00:06,  2.83it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ008-0222.wav


 82%|███████████████████████████████████████████████████████████████████▏              | 82/100 [00:27<00:05,  3.15it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ046-0123.wav


 83%|████████████████████████████████████████████████████████████████████              | 83/100 [00:28<00:06,  2.80it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ030-0044.wav


 84%|████████████████████████████████████████████████████████████████████▉             | 84/100 [00:28<00:06,  2.57it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ018-0051.wav


 85%|█████████████████████████████████████████████████████████████████████▋            | 85/100 [00:29<00:05,  2.59it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ042-0231.wav


 86%|██████████████████████████████████████████████████████████████████████▌           | 86/100 [00:29<00:04,  2.91it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ011-0121.wav


 87%|███████████████████████████████████████████████████████████████████████▎          | 87/100 [00:29<00:05,  2.53it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ016-0186.wav


 88%|████████████████████████████████████████████████████████████████████████▏         | 88/100 [00:30<00:04,  2.65it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ011-0164.wav


 89%|████████████████████████████████████████████████████████████████████████▉         | 89/100 [00:30<00:04,  2.53it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ026-0039.wav


 90%|█████████████████████████████████████████████████████████████████████████▊        | 90/100 [00:31<00:04,  2.38it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ003-0105.wav


 91%|██████████████████████████████████████████████████████████████████████████▌       | 91/100 [00:31<00:03,  2.38it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ039-0104.wav


 92%|███████████████████████████████████████████████████████████████████████████▍      | 92/100 [00:31<00:02,  2.79it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ002-0038.wav


 93%|████████████████████████████████████████████████████████████████████████████▎     | 93/100 [00:32<00:02,  2.85it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ046-0194.wav


 94%|█████████████████████████████████████████████████████████████████████████████     | 94/100 [00:32<00:02,  2.55it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ008-0115.wav


 96%|██████████████████████████████████████████████████████████████████████████████▋   | 96/100 [00:33<00:01,  3.11it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ016-0104.wav
data/ljspeech/LJSpeech-1.1/wavs/LJ019-0301.wav


 97%|███████████████████████████████████████████████████████████████████████████████▌  | 97/100 [00:33<00:01,  2.85it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ028-0012.wav


 98%|████████████████████████████████████████████████████████████████████████████████▎ | 98/100 [00:34<00:00,  2.52it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ018-0059.wav


 99%|█████████████████████████████████████████████████████████████████████████████████▏| 99/100 [00:34<00:00,  2.39it/s]

data/ljspeech/LJSpeech-1.1/wavs/LJ029-0081.wav


100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [00:35<00:00,  2.84it/s]


In [14]:
if RESULT_JSON_PATH.exists() == False:
    with open(RESULT_JSON_PATH, 'w') as f:
        for entry in score4lj_utmos_list:
            f.write(json.dumps(entry) + '\n')
    print(f'Make {RESULT_JSON_PATH}')
else:
    print(f'Already Exists {RESULT_JSON_PATH}')

Make result4eval/ljwav/evallj1031.json


## [seq] eval wav

In [23]:
eval_target = 'lj_via_hifigan'

eval_info = {
    'lj_1013': './result4eval/ljwav/evallj1031.json',
    'lj_pbl': 'result4eval/infer4PBL/groundtruth/evalljPBL.json',
    'lj_via_hifigan': 'result4eval/infer4PBL/groundtruth/eval4PBL.json'
}

eval_jsonl_path = Path(eval_info[eval_target])
if eval_jsonl_path.exists() == True:
    print(f'Exist {eval_jsonl_path}')
    import json
    with open(eval_jsonl_path) as f:
        eval_jsonl_list = [json.loads(l) for l in f]
else:
    print(f'No Exists {eval_jsonl_path}')

Exist result4eval/infer4PBL/groundtruth/eval4PBL.json


In [24]:
utmos_list = [eval_jsonl_list[n]['utmos'] for n in range(len(eval_jsonl_list))]
utmos_nparr = np.array(utmos_list)

In [25]:
utmos_mean = np.mean(utmos_nparr)
utmos_var = np.var(utmos_nparr)
utmos_std = np.std(utmos_nparr)
print(f'utmos mean: {utmos_mean}')
print(f'utmos var: {utmos_var}')
print(f'utmos std: {utmos_std}')

utmos mean: 4.019014093875885
utmos var: 0.09124055665879827
utmos std: 0.30206051820586927
