# Prepare filelists for LJSpeech dataset


In [1]:
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
dir_data = "/Users/henrytse/code/VitsTraining/AI_Vtuber_VoiceTraining-1/output_folder"
config = "../config.yaml"
symlink = "DUMMY3"
n_val = 100
n_test = 500

In [10]:
!export PHONEMIZER_ESPEAK_LIBRARY=/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.dylibimport os
import pathlib
print(pathlib.Path(os.environ['PHONEMIZER_ESPEAK_LIBRARY']))
# Output: PosixPath('/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib')

import ctypes
print(ctypes.cdll.LoadLibrary('/usr/local/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib'))
# Output: <CDLL '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib', handle 89a128f0 at 0x10b42ccd0>





KeyError: 'PHONEMIZER_ESPEAK_LIBRARY'

## Get hyperparameters from config file


In [2]:
import pandas as pd
from utils.hparams import get_hparams_from_file

hps = get_hparams_from_file(config)

## Read dataset

Here `normalized_text` contains numbers in the form of words.

**Note**: you may need to replace all `"|"` with `" | "` in the file `metadata.csv`.


In [3]:
data = pd.read_csv(
    f"/Users/henrytse/code/VitsTraining/AI_Vtuber_VoiceTraining-1/train.csv",
    sep=r"|",
    header=None,
    names=["file", "text", "normalized_text", "cleaned_text"],
    index_col=False,
    # converter to add .wav to file name
    converters={"file": lambda x: f"{symlink}/{x.strip()}.wav", "text": str.strip, "normalized_text": str.strip},
)
data.head()

Unnamed: 0,file,text,normalized_text,cleaned_text
0,DUMMY3/HololiveTest1.wav,"Oh, wouldn't it be ironicif IRyS sabotages the...","Oh, wouldn't it be ironicif IRyS sabotages the...",
1,DUMMY3/HololiveTest1.wav,"Wake up, IRyS!","Wake up, IRyS!",
2,DUMMY3/HololiveTest1.wav,Time to ruin Christmas!,Time to ruin Christmas!,
3,DUMMY3/HololiveTest1.wav,Let's gooo!,Let's gooo!,
4,DUMMY3/HololiveTest1.wav,"How, how is that sabotaged?","How, how is that sabotaged?",


## Text cleaners

It may take a while, so better to preprocess the text and save it to a file in advance.

**Note** `phonemize_text` takes the longest time.`


In [4]:
# Get index of tokenize_text
text_cleaners = hps.data.text_cleaners

token_idx = text_cleaners.index("tokenize_text")
token_cleaners = text_cleaners[token_idx:]
print(token_cleaners)


# Extract phonemize_text
def separate_text_cleaners(text_cleaners):
    final_list = []
    temp_list = []

    for cleaner in text_cleaners:
        if cleaner == "phonemize_text":
            if temp_list:
                final_list.append(temp_list)
            final_list.append([cleaner])
            temp_list = []
        else:
            temp_list.append(cleaner)

    if temp_list:
        final_list.append(temp_list)

    return final_list


text_cleaners = text_cleaners[:token_idx]
text_cleaners = separate_text_cleaners(text_cleaners)
print(text_cleaners)

['tokenize_text', 'add_bos_eos']
[['phonemize_text']]


In [7]:
from text import tokenizer
from torchtext.vocab import Vocab

text_norm = data["normalized_text"].tolist()
for cleaners in text_cleaners:
    print(f"Cleaning with {cleaners} ...")
    if cleaners[0] == "phonemize_text":
        text_norm = tokenizer(text_norm, Vocab, cleaners, language=hps.data.language)
    else:
        for idx, text in enumerate(text_norm):
            temp = tokenizer(text, Vocab, cleaners, language=hps.data.language)
            text_norm[idx] = temp

data = data.assign(cleaned_text=text_norm)
data.head()

Cleaning with ['phonemize_text'] ...


RuntimeError: espeak not installed on your system

## Generate and save vocabulary


In [None]:
from torchtext.vocab import build_vocab_from_iterator
from utils.task import load_vocab, save_vocab
from text.symbols import special_symbols, UNK_ID
from typing import List


def yield_tokens(cleaned_text: List[str]):
    for text in cleaned_text:
        yield text.split()


text_norm = data["cleaned_text"].tolist()
vocab = build_vocab_from_iterator(yield_tokens(text_norm), specials=special_symbols)
vocab.set_default_index(UNK_ID)

vocab_file = f"../vocab.txt"
save_vocab(vocab, vocab_file)

vocab = load_vocab(vocab_file)
print(f"Size of vocabulary: {len(vocab)}")
print(vocab.get_itos())

AttributeError: 'float' object has no attribute 'split'

## Token cleaners


In [None]:
from text import detokenizer

text_norm = data["cleaned_text"].tolist()
for idx, text in enumerate(text_norm):
    temp = tokenizer(text, vocab, token_cleaners, language=hps.data.language)
    assert UNK_ID not in temp, f"Found unknown symbol:\n{text}\n{detokenizer(temp)}"
    text_norm[idx] = temp

text_norm = ["\t".join(map(str, text)) for text in text_norm]
data = data.assign(tokens=text_norm)
data.head()

Unnamed: 0,file,text,normalized_text,cleaned_text,tokens
0,DUMMY1/LJ001-0001.wav,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ...","p ɹ ˈɪ n t ɪ ŋ , <space> ˈɪ n <space> ð ə <spa...",2\t19\t12\t18\t6\t7\t15\t42\t27\t4\t18\t6\t4\t...
1,DUMMY1/LJ001-0002.wav,in being comparatively modern.,in being comparatively modern.,ˈɪ n <space> b ˈiː ɪ ŋ <space> k ə m p ˈæ ɹ ə ...,2\t18\t6\t4\t25\t36\t15\t42\t4\t13\t8\t17\t19\...
2,DUMMY1/LJ001-0003.wav,For although the Chinese took impressions from...,For although the Chinese took impressions from...,f ɔːɹ <space> ɔː l ð ˈoʊ <space> ð ə <space> t...,2\t23\t59\t4\t92\t16\t11\t39\t4\t11\t8\t4\t50\...
3,DUMMY1/LJ001-0004.wav,"produced the block books, which were the immed...","produced the block books, which were the immed...",p ɹ ə d ˈuː s t <space> ð ə <space> b l ˈɑː k ...,2\t19\t12\t8\t10\t44\t9\t7\t4\t11\t8\t4\t25\t1...
4,DUMMY1/LJ001-0005.wav,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...,ð ə <space> ɪ n v ˈɛ n ʃ ə n <space> ʌ v <spac...,2\t11\t8\t4\t15\t6\t21\t22\t6\t37\t8\t6\t4\t28...


## Save train, val, test filelists


In [None]:
data = data[["file", "tokens"]]
data = data.sample(frac=1).reset_index(drop=True)

data_train = data.iloc[n_val + n_test:]
data_val = data.iloc[:n_val]
data_test = data.iloc[n_val: n_val + n_test]

data_train.to_csv("../filelists/train.txt", sep="|", index=False, header=False)
data_val.to_csv("../filelists/val.txt", sep="|", index=False, header=False)
data_test.to_csv("../filelists/test.txt", sep="|", index=False, header=False)