In [1]:
import os

# Set before the espnet imports below.
# NOTE(review): presumably enabled so CUDA errors surface at the failing call while debugging — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from pathlib import Path
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.bin.tts_inference import Text2Speech

%load_ext autoreload
%autoreload 2

# Capture the notebook's working directory with the IPython %pwd magic and wrap it in a Path.
PWD = %pwd
PWD = Path(PWD)
# Both directories are given relative to the notebook's working directory and resolved to absolute paths.
LJSPEECH_DIR = (PWD / '../egs2/ljspeech/tts1/').resolve()
DATA_DIR = (PWD / '../../datasets/CSS10/hungarian/').resolve()
# Device string handed to Text2Speech.from_pretrained and to has.gen_audio in later cells.
device = 'cuda'

In [2]:
# Load the pretrained model from the LJSpeech recipe's experiment directory.
# The working directory is switched to the recipe directory while loading.
# NOTE(review): presumably the config refers to paths relative to the recipe dir — confirm.
os.chdir(LJSPEECH_DIR)
try:
    pretrained_dir = LJSPEECH_DIR / "exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space"
    pretrained_model_file = pretrained_dir / "train.total_count.ave_5best.pth"
    pretrained_tts = Text2Speech.from_pretrained(
        train_config=pretrained_dir / "config_prosody.yaml",
        model_file=pretrained_model_file,
        device=device,
    )
    pretrained_model = pretrained_tts.model
finally:
    # Always restore the notebook's working directory, so a failed load does
    # not leave the kernel sitting in the recipe directory.
    os.chdir(PWD)



It seems weight norm is not applied in the pretrained model but the current model uses it. To keep the compatibility, we remove the norm from the current model. This may cause unexpected behavior due to the parameter mismatch in finetuning. To avoid this issue, please change the following parameters in config to false:
 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm

See also:
 - https://github.com/espnet/espnet/pull/5240
 - https://github.com/espnet/espnet/pull/5249


In [3]:
# Phoneme tokenizer using the 'g2p_en_no_space' G2P type.
arpa_tokenizer = PhonemeTokenizer(g2p_type='g2p_en_no_space')
# Token <-> id converter built from the pretrained model's training token list.
id_converter = TokenIDConverter(pretrained_tts.train_args.token_list)

In [158]:
from en_to_hu import phonemize, G2P

In [4]:
from phonemizer.backend import BACKENDS
from phonemizer.separator import Separator
# espeak phonemizer backend for Hungarian, configured to keep punctuation and stress marks.
# NOTE(review): another cell imports a `G2P` from en_to_hu; whichever cell ran last defines the name.
G2P = BACKENDS['espeak'](
    language='hu',
    preserve_punctuation=True,
    with_stress=True,
)
# Phonemizer output separators: '|' between words, a single space between phones.
SEP = Separator(word='|', phone=' ')
def phonemize_base(text):
    """Phonemize one text with the module-level `G2P` backend and return its tokens.

    The text is phonemized as a single-item batch using `SEP`. Every '|' word
    separator in the result is padded with spaces so that it becomes a token
    of its own, and the string is then split on whitespace.
    """
    raw_output = G2P.phonemize([text], separator=SEP)[0]
    padded = raw_output.replace('|', ' | ')
    # split() with no arguments already ignores leading/trailing whitespace.
    return padded.split()

In [8]:
# Spot-check the raw phonemizer output on two sample sentences.
G2P.phonemize(['De azért akik megszokták, értették a beszédét.', 'Folytatta: - és kedves Juliska!'], separator=SEP)



['d ˌɛ | ˈɑ z eː r t | ˈɑ k i k |m ˈɛ ɡ s o k t aː k |,  ˈeː r t ɛ tː eː k | ˌɑ |b ˈɛ s eː d eː t |.', 'f ˈo j t ɑ tː ɑ |:  ˌeː ʃ |k ˈɛ d v ɛ ʃ |j ˈu l i ʃ k ɑ |!']


['d ˌɛ | ˈɑ z eː r t | ˈɑ k i k |m ˈɛ ɡ s o k t aː k |,  ˈeː r t ɛ tː eː k | ˌɑ |b ˈɛ s eː d eː t |.',
 'f ˈo j t ɑ tː ɑ |:  ˌeː ʃ |k ˈɛ d v ɛ ʃ |j ˈu l i ʃ k ɑ |!']

In [None]:
phoneset = set()
# Please correct invalid characters
# Each input line is unpacked into four '|'-separated fields; the first is the
# audio filename and the third is the text that gets phonemized.
# NOTE(review): `phonemize` is the function imported from en_to_hu in another
# cell, not the local phonemize_base — confirm that is intended.
# The input is opened first so that a missing transcript.txt does not truncate
# an existing output file. Both files hold Hungarian / IPA text, so UTF-8 is
# requested explicitly instead of relying on the platform default encoding.
with open(DATA_DIR / 'transcript.txt', encoding='utf-8') as f, \
        open(DATA_DIR / 'transcript_phones_espeak.txt', 'w', encoding='utf-8') as phones_f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing on the 4-way unpack.
            continue
        filename, _, text2, _ = line.split('|')
        if not text2:
            continue
        # Named `utt_phones` so the notebook-level `phones` list (defined in
        # the phone-frequency cell) is not overwritten when this cell is re-run.
        utt_phones = phonemize(text2)
        phones_f.write(f'{filename}|{" ".join(utt_phones)}\n')
        phoneset |= set(utt_phones)

In [10]:
# from en_to_hu import PHONES_MAP, PUNCS
# phoneset = set(PHONES_MAP.keys()) | set(PUNCS)
# Rebuild the phone inventory from the phonemized transcript file, whose lines
# have the form '<filename>|<space-separated phones>'.
phoneset = set()
# The file contains IPA symbols, so read it as UTF-8 regardless of the
# platform's default encoding.
with open(DATA_DIR / 'transcript_phones_espeak.txt', encoding='utf-8') as f:
    for line in f:
        filename, trans = line.split('|', maxsplit=1)
        # set.update de-duplicates on its own; no membership test is needed.
        phoneset.update(trans.split())

In [160]:
# Display the collected phone inventory as one sorted, space-separated string.
' '.join(sorted(phoneset))

'! , . ? aː b bː c cː d dzː dʒ dː eː f h i iː j k kː l m n o oː p pː r s t ts tsː tʃ tʃː tː u uː v y yː z | ø øː ɑ ɛ ɟ ɟː ɡ ɡː ɲ ʃ ʎ ʒ ˈaː ˈeː ˈi ˈiː ˈo ˈoː ˈu ˈuː ˈy ˈyː ˈø ˈøː ˈɑ ˈɑː ˈɛ ˌaː ˌeː ˌi ˌiː ˌo ˌoː ˌu ˌuː ˌy ˌyː ˌø ˌøː ˌɑ ˌɛ'

In [13]:
from en_to_hu import VOWELS_MAP, CONSONANTS_MAP
# Vowel keys that carry no stress mark (neither 'ˈ' nor 'ˌ'), space-separated.
print(' '.join(key for key in VOWELS_MAP if not ('ˈ' in key or 'ˌ' in key)))
# All consonant keys, space-separated.
print(' '.join(CONSONANTS_MAP))

aː ɑ eː ɛ i iː o oː ø øː u uː y yː
b bː c cː d dː dzː dʒ f h j k kː l m n p pː r s t tː ts tsː tʃ tʃː v z ɟ ɟː ɡ ɡː ɲ ʃ ʎ ʒ


In [5]:
import re

import pandas as pd

# Per-utterance phone frequency table, cached as CSV next to the dataset.
phone_freq_csv = DATA_DIR / 'phone_freq.csv'
phones = 'aː ɑ ɑː eː ɛ i iː o oː ø øː u uː y yː b bː c cː d dː dzː dʒ f h j k kː l m n p pː r s t tː ts tsː tʃ tʃː v z ɟ ɟː ɡ ɡː ɲ ʃ ʎ ʒ'.split()
if phone_freq_csv.exists():
    # The cache is reused as-is; delete phone_freq.csv to force a recount
    # (e.g. after the transcripts or the counting rule change).
    df = pd.read_csv(phone_freq_csv, index_col='filename')
else:
    filenames = []
    transcripts = []
    # The transcript file contains IPA symbols: read it explicitly as UTF-8.
    with open(DATA_DIR / 'transcript_phones_espeak.txt', encoding='utf-8') as f:
        for line in f:
            filename, trans = line.split('|', maxsplit=1)
            filenames.append(filename)
            # Trailing space kept so the stored transcript text is unchanged.
            transcripts.append(trans.strip() + ' ')
    df = pd.DataFrame({'filename': filenames, 'transcript': transcripts}).set_index('filename')
    for phone in phones:
        # Count whole tokens only. A plain substring count of `phone + ' '`
        # also matched longer tokens ending in the same symbol, so 's' was
        # inflated by 'ts', 'ʃ' by 'tʃ' and 'ʒ' by 'dʒ'. An optional leading
        # stress mark (ˈ or ˌ) still counts toward the base phone, which keeps
        # the previous behaviour for stressed vowels.
        token_pattern = rf'(?<!\S)[ˈˌ]?{re.escape(phone)}(?!\S)'
        df[phone] = df.transcript.str.count(token_pattern)
    df.to_csv(phone_freq_csv)


In [6]:
def get_top_transcripts(phone, top_n=2):
    """Return the `top_n` transcripts with the highest count in column `phone`.

    Reads the module-level frame `df`. The result is a dict that maps each
    selected row's index label to its 'transcript' value, ordered from the
    highest count downwards.
    """
    ranked = df.sort_values(by=phone, ascending=False)
    best_rows = ranked.head(top_n)
    return best_rows['transcript'].to_dict()

In [7]:
from en_to_hu import HungarianArpaSpeech, phonemize
# Helper from en_to_hu, wired to the pretrained model's generator inference function
# and to the token-id converter built from the pretrained token list.
has = HungarianArpaSpeech(token_id_converter=id_converter, tts_inference_fn=pretrained_model.tts.generator.inference)

In [8]:

# phones_map = {'a': 'AH2',
#     'ˌa': ('AH1', '1', '1'),
#     'ˈa': ('AH1', '1', '2'),}
def gen_phones_audio(phone, top_n=2, print_phones=True):
    """Synthesize audio for the transcripts in which `phone` is counted most often.

    Args:
        phone: Column of the phone-frequency table to rank transcripts by; it
            is also used as the name of the output sub-directory.
        top_n: Number of top-ranked transcripts to synthesize.
        print_phones: When true, print the phone and the selected transcripts.

    Returns:
        Whatever `has.gen_audio` returned for the last synthesized transcript,
        or None when no transcript was selected.
    """
    top_trans = get_top_transcripts(phone, top_n)
    if print_phones:
        print(phone)
        print(top_trans)
    save_dir = PWD / f'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/hungarian/{phone}'
    # Initialised up front so an empty selection (e.g. top_n=0) returns None
    # instead of raising UnboundLocalError at the return statement.
    inputs = None
    for fname, phones_str in top_trans.items():
        hungarian = phones_str.strip().split()
        inputs = has.gen_audio(
            hungarian,
            save_dir=save_dir,
            verbose=False,
            # Use the last path component as the output file name. For the
            # '<book>/<file>.wav' keys this equals split('/')[1], and it no
            # longer raises IndexError for keys without a directory part.
            custom_filename=fname.rsplit('/', 1)[-1],
            device=device,
        )
    return inputs

In [341]:
# Inspect the generated inputs for a short phone sequence; with verbose=True the
# arpas / d_factor / p_factor / e_factor table is printed.
inputs = has.gen_inputs('ˈɛ ɟ | ˈy ɟ ɛ ʃ | l ˈɛ ɡ eː ɲ | h ˈɑ n dʒ aː r t |'.split(), verbose=True)

   -1      0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16    17    18    19    20    21    22    23    24
 arpas    EH1   G   HH    ,   UH1   Y    G    Y   EH0  SH    L    EH1    G    EY0    N     Y     ,     HH   AA1    N     D     ZH   AH2    R     T
d_factor   1   0.4  0.4   0    0    1   0.7   0    1    1    1     1    0.7    1    0.8   0.2    0     1    0.5    1    0.7   0.3    1     1    0.7
p_factor   0    0    0    0    0    0    0    0    0    0    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
e_factor   0    0    0    0    0    0    0    0    0    0    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0


In [120]:
# Synthesize a short plain-text sample into the hungarian output directory.
inputs = has.gen_audio('. hej help mi', save_dir=PWD / f'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/hungarian/')

In [9]:
# Synthesize the 3 transcripts ranked highest for the phone 'u'; the selection is printed.
inputs = gen_phones_audio(phone='u', top_n=3)

u
{'egri_csillagok/egri_csillagok_2932.wav': 'k ˈɑ r ɟ u k | d ˈɑ m ɑ s k u s b ɑ n | k ˌeː s y l t | , v ˈeː r c y k | d ˈɛ r b ɛ n d i | ˈɑ ts eː l | , l ˈaː n dʒ aː j u k | h ˈi n d o s t aː n i | m ˈɛ ʃ t ɛ r k o v aː tʃ o k t oː l | v ˌɑ l oː | , ˈaː ɟ uː i k ɑ t | ˈɛ u r oː p ɑ | l ˈɛ ɡ j o bː | ˈø n t øː i | ˈɑ l k o cː aː k | m ˌɛ ɡ | , p ˈu ʃ k ɑ p o r u k | , ɡ ˈo j oː j u k | , f ˈɛ ɟ v ɛ r y k | m ˈeː r h ɛ t ɛ t l ɛ n | ˌeː ʃ | m ˈɛ ɡ s aː m l aː l h ɑ t ɑ t l ɑ n | . ', 'egri_csillagok/egri_csillagok_4228.wav': 'ˌɑ | s ˈɛ n tʃ eː ɡ b ɛ n | , ˈɑ m i t | ˈi tː | l ˈaː t u n k | , t ˈu ɟː u k | , h ˌo ɟ ˌɑ z | ˈeː l øː | j ˈeː z u ʃ | v ˈɑ n | j ˈɛ l ɛ n | . v ˈɛ l y n k | v ˌɑ n | ! b ˈo r u ʎ j u n k | l ˌɛ | , ˌeː ʃ | ˈi m aː d k o z z u n k | ! ', 'egri_csillagok/egri_csillagok_2315.wav': 't ˈu d o d | , ˈu r ɑ m | , s ˈɛ ɡ eː ɲ ɛ k | v ˌɑ ɟ u n k | , h ˈaː t | ˈɛ ʃ t eː n k i n t | h ˈɑ l aː s n u n k | k ˌɛ l l | . h ˈɑ n ɛ m | ˌɑ z | ˈeː ʎ j ɛ l | n ˈɛ m tʃ ɑ k | h ˈɑ

In [164]:
# Synthesize the top 3 transcripts for every phone in `phones`, without printing the selections.
for phone in phones:
    _ = gen_phones_audio(phone, top_n=3, print_phones=False)

In [34]:
# One-off experiment: synthesize a single utterance with verbose diagnostics.
hungarian = 'creíais oíais'
# Output file name: the text itself for a string, otherwise the concatenated tokens.
if isinstance(hungarian, str):
    custom_filename = hungarian + '.wav'
else:
    custom_filename = ''.join(hungarian) + '.wav'
# The generated inputs are kept so they can be tweaked by hand and passed back
# through the commented-out `inputs=` argument below.
arpas, d_factor, p_factor, e_factor = has.gen_inputs(hungarian)
# d_factor[0, 1] *= 1.5
custom_arpa_subs = dict()
# for i in range(3):
#     custom_arpa_subs[f'N UW{i}'] = (f'N N UW{i}', '0 0.7 0', '0 =0 =1', '0 =0 =1')
#     custom_arpa_subs[f'T UW{i}'] = (f'T W W', '=0 1 =1', '=0 0 =1', '=0 0 =1')
output_dir = PWD / 'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/hungarian'
inputs = has.gen_audio(
    hungarian,
    # inputs=(arpas, d_factor, p_factor, e_factor),
    save_dir=output_dir,
    custom_arpa_subs=custom_arpa_subs,
    custom_filename=custom_filename,
    device=device,
    verbose=True,
)

Updated 
   -1      0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16
 arpas     K    R   EH0   Y   IY1   Y   AA0   Y    S    ,   OW0    Y    IY1    Y    AA0    Y     S
d_factor  0.7   1    1   0.4  0.7  0.4   1   0.4   1    0    1    0.4   0.7   0.4    1    0.4    1
p_factor   0    0    0    0    1    0    0    0    0    0    0     0     1     0     0     0     0
e_factor   0    0   0.5   0   0.5   0   0.5   0    0    0   0.5    0    0.5    0    0.5    0     0
Duration pred: tensor([ 5.9971,  6.2804,  8.7144,  7.8999,  9.0675,  8.4928,  9.2692, 10.6716,
        10.3342, 20.5245, 12.5303,  9.2557,  8.4650,  9.1517,  9.3981, 11.4364,
        19.4551], device='cuda:0')
Pitch pred: tensor([ 0.9911,  1.0531,  0.6471,  0.3290,  0.2310, -0.4055, -0.8396, -0.4751,
        -0.2946, -0.6380, -0.4088,  0.2300,  0.6042, -0.2134, -0.9145, -0.6451,
        -0.4752], device='cuda:0')
Energy pred: tensor([-0.8592,  1.2776,  0.6716,  0.2274,  0.3890,  0.3921,  0.

In [93]:
from tqdm import tqdm
# Now run on entire dataset
save_dir = PWD / 'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/CSS10/hungarian'
transcript_file = (DATA_DIR / 'transcript_phones_espeak.txt').resolve()
# Read every line up front (explicit UTF-8: the file contains IPA symbols) so
# the file handle is not held open for the whole synthesis run.
with open(transcript_file, encoding='utf-8') as f:
    transcript_lines = f.read().splitlines()
for line in tqdm(transcript_lines):
    if not line.strip():
        # Tolerate blank lines instead of failing on the 2-way unpack.
        continue
    # Named `phones_str` so the notebook-level `phones` list, which the
    # per-phone generation loop iterates over, is not overwritten by a string.
    custom_filename, phones_str = line.strip().split('|', maxsplit=1)
    if (save_dir / custom_filename).exists():
        # Already synthesized in an earlier run; makes the cell resumable.
        continue
    hungarian = phones_str.split()

    inputs = has.gen_audio(
        hungarian,
        save_dir=save_dir,
        verbose=False,
        custom_filename=custom_filename,
        device=device,
    )


100%|██████████| 4514/4514 [19:09<00:00,  3.93it/s]
