In [1]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from pathlib import Path
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.bin.tts_inference import Text2Speech

%load_ext autoreload
%autoreload 2

# Notebook working directory, captured once so later cells can chdir away and
# come back. Path.cwd() replaces the `%pwd` magic followed by a str -> Path
# rebinding of the same name.
PWD = Path.cwd()
# ESPnet LJSpeech recipe directory (the pretrained experiment is loaded from here).
LJSPEECH_DIR = (PWD / '../egs2/ljspeech/tts1/').resolve()
# CSS10 Spanish dataset directory (transcripts and derived phone files).
DATA_DIR = (PWD / '../../datasets/CSS10/spanish/').resolve()
# Device passed to model loading and audio generation.
device = 'cuda'

In [2]:
# Load the pretrained TTS model from the LJSpeech recipe's experiment directory.
pretrained_dir = LJSPEECH_DIR / "exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space"
pretrained_model_file = pretrained_dir / "train.total_count.ave_5best.pth"
# The model is loaded with the recipe directory as the working directory
# (presumably because the config refers to recipe-relative paths — TODO confirm).
# try/finally guarantees the notebook's own working directory is restored even
# when loading raises, so later cells never run from the wrong directory.
os.chdir(LJSPEECH_DIR)
try:
    pretrained_tts = Text2Speech.from_pretrained(
        train_config=pretrained_dir / "config_prosody.yaml",
        model_file=pretrained_model_file,
        device=device,
    )
finally:
    os.chdir(PWD)
pretrained_model = pretrained_tts.model



It seems weight norm is not applied in the pretrained model but the current model uses it. To keep the compatibility, we remove the norm from the current model. This may cause unexpected behavior due to the parameter mismatch in finetuning. To avoid this issue, please change the following parameters in config to false:
 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm

See also:
 - https://github.com/espnet/espnet/pull/5240
 - https://github.com/espnet/espnet/pull/5249


In [3]:
# Token <-> id lookup built from the loaded model's own token list.
id_converter = TokenIDConverter(token_list=pretrained_tts.train_args.token_list)
# Phoneme tokenizer using the 'g2p_en_no_space' G2P, the same G2P name that
# appears in the pretrained experiment directory.
arpa_tokenizer = PhonemeTokenizer(g2p_type='g2p_en_no_space')

In [4]:
from en_to_es import phonemize, G2P

In [6]:
from phonemizer.backend import BACKENDS
from phonemizer.separator import Separator
# Output format: words are delimited by '|' and phones by a single space.
SEP = Separator(phone=' ', word='|')
# Spanish espeak backend that keeps punctuation and stress marks.
# NOTE(review): this rebinds `G2P`, a name an earlier cell also imports from
# en_to_es — confirm the shadowing is intended.
G2P = BACKENDS['espeak'](
    language='es',
    with_stress=True,
    preserve_punctuation=True,
)
def phonemize_base(text):
    """Phonemize `text` with the module-level G2P backend.

    Returns a flat list of tokens in which every word separator '|' is a
    token of its own.
    """
    raw = G2P.phonemize([text], separator=SEP)[0]
    # Pad each '|' with spaces so that a whitespace split isolates it;
    # split() without arguments also discards leading/trailing whitespace.
    padded = raw.replace('|', ' | ')
    return padded.split()

In [49]:
# Phonemize every transcript line and collect the set of phones encountered.
# Each line of transcript.txt must have exactly four '|'-separated fields;
# only the first (filename) and the third (text) are used.
# TODO (from the original cell): please correct invalid characters.
phoneset = set()
# Explicit UTF-8: the phone strings contain IPA characters that a non-UTF-8
# default encoding cannot represent. The input is opened first so that a
# missing transcript.txt does not leave a truncated output file behind.
with open(DATA_DIR / 'transcript.txt', encoding='utf-8') as f, \
        open(DATA_DIR / 'transcript_phones_espeak.txt', 'w', encoding='utf-8') as phones_f:
    for line in f:
        # Named throwaways instead of `_`, which IPython uses for the last output.
        filename, _unused_a, text2, _unused_b = line.split('|')
        if not text2:
            # Nothing to phonemize for this entry.
            continue
        phones = phonemize(text2)
        phones_f.write(f'{filename}|{" ".join(phones)}\n')
        phoneset.update(phones)

In [20]:
# Rebuild the phone inventory from the phonemized transcript file.
# Alternative kept from the original cell (derive it from en_to_es instead):
# from en_to_es import PHONES_MAP, PUNCS
# phoneset = set(PHONES_MAP.keys()) | set(PUNCS)
phoneset = set()
# Explicit UTF-8 because the file contains IPA characters.
with open(DATA_DIR / 'transcript_phones_espeak.txt', encoding='utf-8') as f:
    for line in f:
        # Everything after the first '|' is the space-separated phone string.
        filename, trans = line.split('|', maxsplit=1)
        # set.update already ignores duplicates; no membership test is needed.
        phoneset.update(trans.split())

In [21]:
# Display the collected phone inventory as one sorted, space-separated string.
' '.join(sorted(phoneset))

'! , . ? a aɪ aʊ b d dʒ e eɪ eʊ f i j k l m n o oɪ p pː r s t tʃ u w x | ð ŋ ɡ ɣ ɲ ɾ ʎ ʝ ˈa ˈaɪ ˈaʊ ˈe ˈeɪ ˈeʊ ˈi ˈo ˈoɪ ˈu ˈɛ ˌa ˌaɪ ˌaʊ ˌe ˌeɪ ˌeʊ ˌi ˌo ˌoɪ ˌu β θ'

In [150]:
from en_to_es import VOWELS_MAP
# Print the keys of VOWELS_MAP joined with '|'.
print('|'.join(VOWELS_MAP.keys()))

a|ˌa|ˈa|aɪ|ˌaɪ|ˈaɪ|aʊ|ˌaʊ|ˈaʊ|e|ˌe|ˈe|ˈɛ|eɪ|ˌeɪ|ˈeɪ|eʊ|ˌeʊ|ˈeʊ|i|ˌi|ˈi|o|ˌo|ˈo|oɪ|ˌoɪ|ˈoɪ|u|ˌu|ˈu


In [5]:
import pandas as pd
# Per-utterance phone counts, cached as a CSV file inside the dataset directory.
phone_freq_csv = DATA_DIR / 'phone_freq.csv'
# Phones to count, written without stress marks.
phones = 'a aɪ aʊ b β d ð e eɪ eʊ ɛ f ɡ ɣ i j ʝ k l ʎ m n ɲ ŋ o oɪ p pː r ɾ s t tʃ θ u w x'.split()
if phone_freq_csv.exists():
    # Reuse the cached table instead of recounting.
    df = pd.read_csv(phone_freq_csv, index_col='filename')
else:
    records = []
    # Explicit UTF-8 because the transcripts contain IPA characters.
    with open(DATA_DIR / 'transcript_phones_espeak.txt', encoding='utf-8') as f:
        for line in f:
            filename, trans = line.split('|', maxsplit=1)
            # A trailing space is appended so the final token also matches
            # the "<phone> " pattern counted below.
            records.append({'filename': filename, 'transcript': trans.strip() + ' '})
    # Built in one expression rather than mutated with inplace=True.
    df = pd.DataFrame(records, columns=['filename', 'transcript']).set_index('filename')
    for phone in phones:
        # Series.str.count treats its argument as a regular expression and
        # counts substring matches, so stress-marked tokens such as 'ˈa ' are
        # counted under 'a' as well (presumably intended, since the list above
        # carries no stress marks). None of the listed phones contains a regex
        # metacharacter.
        df[phone] = df['transcript'].str.count(phone + ' ')
    df.to_csv(phone_freq_csv)


In [6]:
def get_top_transcripts(phone, top_n=2):
    """Return the `top_n` transcripts with the highest count in column `phone`.

    The result maps each row's index label to its transcript string, taken
    from the module-level `df` after sorting by `phone` in descending order.
    """
    ranked = df.sort_values(by=phone, ascending=False)
    best = ranked['transcript'].head(top_n)
    return best.to_dict()

In [7]:
from en_to_es import SpanishArpaSpeech, phonemize
# Build the SpanishArpaSpeech helper (imported from en_to_es), handing it the
# model's token-id converter and the pretrained generator's inference function.
sas = SpanishArpaSpeech(token_id_converter=id_converter, tts_inference_fn=pretrained_model.tts.generator.inference)

ɛ
{'bailen/bailen_0712.wav': 'ˌo β e ð ˈe θ e n | a | l a | ˌa l e ɣ ɾ ˈi a | ð e | ˈu n | m o m ˈɛ n t o | , a | l a | p ˈe n a | ð e | ˈo t ɾ o | m o m ˈɛ n t o | , a | l a s | ˌa ŋ ɡ u s t j ˈo s a s | ˌa l t e ɾ n a t ˈi β a s | k e | e n | e l | ð i s k ˈu ɾ s o | ð e | k w ˈa n t a s | ˈo ɾ a s | k o n s j ˈɛ n t e | i | ð i s p ˈo n e | ð j ˈo s | , ˌe s p e k t a ð ˈo ɾ | n ˈo | ˌi n d i f e ɾ ˈɛ n t e | ð e | ˈe s t a s | β ˌa ɾ β a ɾ i ð ˈa ð e s | ð e | l o s | ˈo m b ɾ e s | . ', 'batalla_arapiles/batalla_arapiles_3930.wav': 'n ˈo | x u θ ɣ ˈe i s | m i | ˌa t ɾ e β i m j ˈɛ n t o | k o n | k ɾ i t ˈe ɾ j o | β u l ɣ ˈa ɾ | , k ɾ e j j ˈɛ n d o | k e | n ˈo | f ˈa l t o | a l | ð e k ˈo ɾ o | , a | l a s | k ˌo m b e n j ˈɛ n θ j a s | i | a l | p u ð ˈo ɾ | ð i θ j ˈɛ n d o | a | ˈu n | ˈo m b ɾ e | k e | l e | ˈa m o | . '}
f
{'bailen/bailen_0362.wav': 'm i ɾ ˈa β a m o s | k o n | d ˌe s ð e ɲ ˈo s a | ˌi n d i f e ɾ ˈɛ n θ j a | a | l o s | k e | k e ð ˈa ɾ o n | d e | 

In [8]:

# Example of a per-phone override in the shape the (currently disabled)
# `phones_map` argument below would take — kept for reference:
# phones_map = {'a': 'AH2',
#     'ˌa': ('AH1', '1', '1'),
#     'ˈa': ('AH1', '1', '2'),}
def gen_phones_audio(phone, top_n=2, print_phones=True):
    """Generate audio for the transcripts in which `phone` occurs most often.

    Args:
        phone: Column of the phone-frequency table to rank transcripts by;
            also used as the name of the output sub-directory.
        top_n: Number of top-ranked transcripts to process.
        print_phones: When true, print the phone and the selected transcripts.

    Returns:
        The value returned by ``sas.gen_audio`` for the last processed
        transcript, or ``None`` when no transcript was selected.
    """
    top_trans = get_top_transcripts(phone, top_n)
    if print_phones:
        print(phone)
        print(top_trans)
    save_dir = PWD / f'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/spanish/{phone}'
    # Pre-bind the result so an empty selection returns None instead of
    # raising UnboundLocalError at the return statement.
    inputs = None
    for fname, phones_str in top_trans.items():
        inputs = sas.gen_audio(
            phones_str.split(),
            save_dir=save_dir,
            verbose=False,
            # phones_map=phones_map,
            # Basename only; rsplit also copes with names that contain no '/'.
            custom_filename=fname.rsplit('/', 1)[-1],
            device=device,
        )
    return inputs

In [21]:
# Generate audio for the four transcripts with the most 'aɪ' occurrences;
# print_phones defaults to True, so the selection is printed as well.
inputs = gen_phones_audio(phone='aɪ', top_n=4)

aɪ
{'batalla_arapiles/batalla_arapiles_4347.wav': 'l o | s ˈe | . . . k ɾ e ˈi aɪ s | r ˌe β a x ˈa ɾ o s | s ˈo l o | ˌo k u p ˈa n d o o s | ð e l | a s ˈu n t o | . . . l o | θ j ˈe ɾ t o | ˈe s | k e | o ˈi aɪ s | t ˈo ð o | , i | k a ʎ ˈa β aɪ s | . ', 'bailen/bailen_0246.wav': 'p w ˈe s | s i | n ˈo | ˈaɪ | ˌe n e m ˈi ɣ o s | e m | b aɪ l ˈe n | , k ˈe | ˈe s | ˈe s o | ð e | ˌa t a k ˈa ɾ | a | β aɪ l ˈe n | ? ', 'batalla_arapiles/batalla_arapiles_2254.wav': 's e ɲ ˈo ɾ a | , e s t ˈa m o s | p e ɾ ð ˈi ð o s | ! n ˈo | k o n t ˈa β a m o s | k o n | l a | t ɾ aɪ θ j ˈo n | . l a | t ɾ aɪ θ j ˈo n | ! d ˈi x o | k o m f ˈu s a | m ˈi s s | ˌe f e ˌe l e ˌi ɣ ɾ i ˈe ɣ a | . n ˈo | p w ˈe ð e | s ˈe r | . ', 'batalla_arapiles/batalla_arapiles_3910.wav': 'ˈe ɾ aɪ s | e l | θ ˈi d | , b e ɾ n ˈa ɾ ð o | ð e l | k ˈa ɾ p j o | , θ ˈaɪ ð e | , ˌa β e n a m ˈa ɾ | , θ e l ˈi n d o s | , l ˌa n θ a ɾ ˈo t e | ð e l | l ˈa ɣ o | , f e ɾ n ˈa ŋ | ɡ o n θ ˈa l e θ | i | p ˈe ð ɾ o | a n s

In [23]:
# Generate audio for the top four transcripts of every phone in `phones`.
# The return value is discarded without binding it to `_`: assigning `_` in
# IPython shadows its last-output variable for the rest of the session.
for phone in phones:
    gen_phones_audio(phone, top_n=4, print_phones=False)

In [35]:
# Inspect the inputs generated for a sample phrase; with verbose=True the
# per-token arpas / d_factor / p_factor / e_factor table is printed.
sas.gen_inputs('creíais oíais', verbose=True)

   -1      0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16
 arpas     K    R   EH0   Y   IY1   Y   AA0   Y    S    ,   OW0    Y    IY1    Y    AA0    Y     S
d_factor  0.7   1    1   0.4  0.7  0.4   1   0.4   1    0    1    0.4   0.7   0.4    1    0.4    1
p_factor   0    0    0    0    1    0    0    0    0    0    0     0     1     0     0     0     0
e_factor   0    0   0.5   0   0.5   0   0.5   0    0    0   0.5    0    0.5    0    0.5    0     0


(array(['K', 'R', 'EH0', 'Y', 'IY1', 'Y', 'AA0', 'Y', 'S', ',', 'OW0', 'Y',
        'IY1', 'Y', 'AA0', 'Y', 'S'], dtype='<U3'),
 tensor([[0.7000, 1.0000, 1.0000, 0.4000, 0.7000, 0.4000, 1.0000, 0.4000, 1.0000,
          0.0000, 1.0000, 0.4000, 0.7000, 0.4000, 1.0000, 0.4000, 1.0000]],
        device='cuda:0'),
 tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]],
        device='cuda:0'),
 tensor([[0.0000, 0.0000, 0.5000, 0.0000, 0.5000, 0.0000, 0.5000, 0.0000, 0.0000,
          0.0000, 0.5000, 0.0000, 0.5000, 0.0000, 0.5000, 0.0000, 0.0000]],
        device='cuda:0'))

In [34]:
# One-off generation for a single phrase, with hooks for manual tweaking.
spanish = 'creíais oíais'
# The output file is named after the input; a token sequence is concatenated first.
custom_filename = (spanish if isinstance(spanish, str) else ''.join(spanish)) + '.wav'
# Inputs are generated separately so they can be edited by hand and passed
# back through the (currently disabled) `inputs=` argument below.
arpas, d_factor, p_factor, e_factor = sas.gen_inputs(spanish)
# d_factor[0, 1] *= 1.5
# Optional ARPA substitutions; disabled examples of entries follow.
custom_arpa_subs = dict()
# for i in range(3):
#     custom_arpa_subs[f'N UW{i}'] = (f'N N UW{i}', '0 0.7 0', '0 =0 =1', '0 =0 =1')
#     custom_arpa_subs[f'T UW{i}'] = (f'T W W', '=0 1 =1', '=0 0 =1', '=0 0 =1')
inputs = sas.gen_audio(
    spanish,
    # inputs=(arpas, d_factor, p_factor, e_factor),
    custom_filename=custom_filename,
    custom_arpa_subs=custom_arpa_subs,
    save_dir=PWD / 'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/spanish',
    device=device,
    verbose=True,
)

Updated 
   -1      0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16
 arpas     K    R   EH0   Y   IY1   Y   AA0   Y    S    ,   OW0    Y    IY1    Y    AA0    Y     S
d_factor  0.7   1    1   0.4  0.7  0.4   1   0.4   1    0    1    0.4   0.7   0.4    1    0.4    1
p_factor   0    0    0    0    1    0    0    0    0    0    0     0     1     0     0     0     0
e_factor   0    0   0.5   0   0.5   0   0.5   0    0    0   0.5    0    0.5    0    0.5    0     0
Duration pred: tensor([ 5.9971,  6.2804,  8.7144,  7.8999,  9.0675,  8.4928,  9.2692, 10.6716,
        10.3342, 20.5245, 12.5303,  9.2557,  8.4650,  9.1517,  9.3981, 11.4364,
        19.4551], device='cuda:0')
Pitch pred: tensor([ 0.9911,  1.0531,  0.6471,  0.3290,  0.2310, -0.4055, -0.8396, -0.4751,
        -0.2946, -0.6380, -0.4088,  0.2300,  0.6042, -0.2134, -0.9145, -0.6451,
        -0.4752], device='cuda:0')
Energy pred: tensor([-0.8592,  1.2776,  0.6716,  0.2274,  0.3890,  0.3921,  0.

In [10]:
from tqdm import tqdm
# Now run on the entire dataset. Entries whose target file already exists
# under save_dir are skipped.
save_dir = PWD / 'outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/CSS10/spanish'
transcript_file = (DATA_DIR / 'transcript_phones_espeak.txt').resolve()
# Read all lines up front (explicit UTF-8 for the IPA characters) so the file
# is not held open for the whole generation loop.
with open(transcript_file, encoding='utf-8') as f:
    transcript_lines = f.read().splitlines()
for line in tqdm(transcript_lines):
    line = line.strip()
    if not line:
        # A blank line would otherwise fail the two-field unpacking below.
        continue
    # Bound to `phones_str`, not `phones`: the notebook-level `phones` list is
    # iterated by the per-phone generation cell and must not be overwritten
    # with a transcript string.
    custom_filename, phones_str = line.split('|', maxsplit=1)
    if (save_dir / custom_filename).exists():
        continue
    spanish = phones_str.split()

    inputs = sas.gen_audio(
        spanish,
        save_dir=save_dir,
        verbose=False,
        custom_filename=custom_filename,
        device=device,
    )
    # arpas, d_factor, e_factor = inputs


100%|██████████| 11016/11016 [42:22<00:00,  4.33it/s]


In [34]:
# NOTE(review): this passes a German sentence to `phonemize` (imported from
# en_to_es) — presumably a cross-language experiment; confirm it is intended.
p = ' '.join(phonemize('die Arme umeinander legten und, Wange an Wange, ihr Blut aneinander pochen ließen.'))