In [1]:

import os
# Set before torch is imported. CUDA_LAUNCH_BLOCKING=1 is a debugging aid that
# makes CUDA kernel launches synchronous so errors are reported at the call
# that caused them; it slows GPU work, so consider removing it for normal runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from pathlib import Path

import soundfile as sf
import torch
from prosody.utils.text import TextEffectProcessor
from prosody.utils.utils import get_p_mod_fns, print_table, clean_filename
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.token_id_converter import TokenIDConverter
from espnet2.bin.tts_inference import Text2Speech

%load_ext autoreload
PWD = %pwd
PWD = Path(PWD)
LJSPEECH_DIR = (PWD / '../egs2/ljspeech/tts1/').resolve()
device = 'cuda'

In [12]:
# Mode 2 of the autoreload extension: reload all modules before executing each
# cell, so edits to imported project code are picked up without a restart.
%autoreload 2

In [2]:
# Checkpoint and config of the pretrained model inside the recipe directory.
pretrained_dir = LJSPEECH_DIR / "exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space"
pretrained_model_file = pretrained_dir / "train.total_count.ave_5best.pth"

# The model is loaded with the recipe directory as the working directory
# (presumably because the config refers to files by relative path — confirm).
# try/finally guarantees the original working directory is restored even when
# loading fails, so later cells never run from the wrong directory.
cwd = os.getcwd()
os.chdir(LJSPEECH_DIR)
try:
    pretrained_tts = Text2Speech.from_pretrained(
        train_config=pretrained_dir / "config_prosody.yaml",
        model_file=pretrained_model_file,
        device=device
    )
finally:
    os.chdir(cwd)
pretrained_model = pretrained_tts.model

It seems weight norm is not applied in the pretrained model but the current model uses it. To keep the compatibility, we remove the norm from the current model. This may cause unexpected behavior due to the parameter mismatch in finetuning. To avoid this issue, please change the following parameters in config to false:
 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm

See also:
 - https://github.com/espnet/espnet/pull/5240
 - https://github.com/espnet/espnet/pull/5249


In [3]:
# Grapheme-to-phoneme tokenizer (not referenced again in the visible cells).
arpa_tokenizer = PhonemeTokenizer(g2p_type='g2p_en_no_space')
# Maps phoneme tokens to the ids of the pretrained model's token list.
id_converter = TokenIDConverter(pretrained_tts.train_args.token_list)
# Parses the prosody markup in the input text into model inputs.
tep = TextEffectProcessor(tokens2ids_fn=id_converter.tokens2ids)
# Synthesized audio is written to outputs/<model directory name>/.
save_dir = PWD / 'outputs' / pretrained_dir.stem
# Create the output directory up front; writing a wav into a missing
# directory would otherwise fail on a fresh checkout.
save_dir.mkdir(parents=True, exist_ok=True)

import re
def save_wav(orig_text, model=pretrained_model, **kwargs):
    """Synthesize speech and write it as a 16-bit PCM wav file under ``save_dir``.

    Args:
        orig_text: Source text. It names the output file (through
            ``clean_filename``) and, when ``phone_ids`` is not given, is also
            parsed by ``tep.get_inputs`` to obtain the model inputs.
        model: Object exposing ``tts.inference``; defaults to the pretrained model.
        **kwargs:
            suffix: Optional tag placed between the cleaned text and ``.wav``.
                It is honored whether or not ``phone_ids`` is supplied.
            phone_ids: Pre-computed phoneme ids. When present, these and all
                remaining keyword arguments are forwarded to
                ``model.tts.inference``. When absent, the inputs are derived
                from ``orig_text`` and any remaining keyword arguments are
                ignored.

    Returns:
        The dictionary returned by ``model.tts.inference`` (its ``'wav'`` entry
        is the waveform that was written).
    """
    # Pop the suffix first so it applies to both branches below.
    suffix = kwargs.pop('suffix', '')
    with torch.no_grad():
        if 'phone_ids' in kwargs:
            phone_ids = kwargs.pop('phone_ids')
            output_dict = model.tts.inference(text=phone_ids, **kwargs)
        else:
            phonemes, phone_ids, d_factor, p_factor, e_factor, d_split_factor, _ = tep.get_inputs(
                orig_text, print_alignment=True
            )
            output_dict = model.tts.inference(
                text=phone_ids, d_factor=d_factor, p_factor=p_factor,
                e_factor=e_factor, d_split_factor=d_split_factor, verbose=True
            )
        wav = output_dict['wav']
        # NOTE: with an empty suffix the name is '<text>..wav'; the format is
        # left unchanged so previously generated files keep their names.
        filename = f'{clean_filename(orig_text)}.{suffix}.wav'
        # Make sure the target directory exists before writing.
        save_dir.mkdir(parents=True, exist_ok=True)
        # 22050 Hz is hard-coded; presumably the model's sampling rate — TODO confirm.
        sf.write(save_dir / filename, wav.cpu().numpy(), 22050, "PCM_16")
    return output_dict

In [4]:
# Named pitch contours used as the middle element of the word-tone tuples.
low, high = [-1], [+1.5]
rising, falling = [-1, +1.5], [+1.5, -1]
rise_fall = [-1.0, +1.5, 1.0, 0.0]


def combine_fn(x, y):
    """Combine two values by addition (passed to ``get_p_mod_fns`` as ``combine_fns``)."""
    return x + y

In [16]:
# Demo sentence that uses the inline prosody markup (*, ^, _, ~ and
# repeated letters); the alignment printed below shows how it is parsed.
orig_text = "THIS *is* ^ABso^LLLUUUTTTE_ly cr^aaaaaazy~~~!!"
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, _,
) = tep.get_inputs(orig_text, print_alignment=True)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
)
# No suffix is passed here; the call is the last expression, so its result
# is shown as the cell output.
save_wav(
    orig_text,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    verbose=True,
)

     -1         0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16    17     18    19    20    21    22
grapheme_list  th    i    s    i    s    a    b    s    o    l    u     t     e     l     y     c     r      a     z     y     !     !
phoneme_list   DH   IH1   S   IH1   Z   AE2   B    S   AH0   L   UW1    T           L    IY0    K     R     EY1    Z    IY0    !     !
  low_tones                                                                         _
 high_tones                              ^                   ^                                               ^
  emphases      *    *    *    *    *    *    *              *    *     *     *
   longers                                                  ~~    ~~    ~~                                 ~~~~~        ~~~
      -1         0    1    2    3    4    5    6    7    8    9    10    11    12    13    14    15    16    17    18    19    20
   phonemes     DH   IH1   S   IH1   Z   AE2   B    S   A

{'wav': tensor([7.1454e-05, 4.3386e-04, 6.8417e-04,  ..., 2.2558e-04, 1.2258e-04,
         1.7337e-04], device='cuda:0'),
 'duration': tensor([ 7,  6,  9,  7,  9, 11,  8, 10,  6,  7,  7,  7,  7,  7,  7,  6,  6,  9,
          9, 13, 13, 13, 12, 12, 12, 11,  4], device='cuda:0')}

In [74]:
# Hand-specified pitch contour on one elongated phoneme.
orig_text = "Suuuuuuuuure."
phonemes, phone_ids, d_factor, p_factor, e_factor, d_split_factor, _ = tep.get_inputs(orig_text, print_alignment=True)
# One independent empty contour per phoneme. `[[]] * n` would put the SAME
# list object in every slot, so an in-place edit of one entry would silently
# change all of them.
pitch_values = [[] for _ in range(len(phonemes))]
# Contour for the phoneme at index 1 (the elongated vowel in the table below).
pitch_values[1] = [-0.5, +1, -0.5]
combine_fn = lambda x,y: x+y
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(phonemes=phonemes, phone_ids=phone_ids, d_factor=d_factor, p_factor=p_factor, e_factor=e_factor, d_split_factor=d_split_factor, pitch_values=pitch_values)
save_wav(orig_text, phone_ids=phone_ids, d_factor=d_factor, d_split_factor=d_split_factor, p_mod_fns=p_mod_fns, verbose=True)

     -1         0      1       2    3    4    5    6    7    8    9    10
grapheme_list   s      u       r    e    .
phoneme_list   SH     UH1      R         .
  low_tones
 high_tones
  emphases
   longers          ~~~~~~~~
pitch_values   -1   -1|1.5|0   0
      -1         0        1        2    3
   phonemes     SH       UH1       R    .
  phone_ids     35       44        7   33
   d_factor      1        1        1    1
   p_factor      0        0        0    0
   e_factor      0        0        0    0
d_split_factor   1        5        1    1
 pitch_values        -0.5|1|-0.5
Duration pred: tensor([19.4962, 13.2584, 10.9335,  9.1969], device='cuda:0')
Pitch pred: tensor([ 0.8972,  0.1142, -0.8400, -0.9682], device='cuda:0')
Energy pred: tensor([ 0.1594,  0.8254, -0.3954, -1.0230], device='cuda:0')
Duration (new): tensor([19, 13, 13, 13, 13, 13, 11,  9], device='cuda:0')
Pitch (new): tensor([ 0.8972, -0.3858,  0.3642,  1.1142,  0.3642, -0.3858, -0.8400, -0.9682],
       device='cuda:0'

{'wav': tensor([-0.0205,  0.0137,  0.0520,  ...,  0.0007,  0.0008,  0.0007],
        device='cuda:0'),
 'duration': tensor([19, 13, 13, 13, 13, 13, 11,  9], device='cuda:0')}

In [5]:
# Flat list alternating a sentence (str) with its word-tone tuple
# (word, contour, index).
long_phone_sentences = [
    "Suuuuuuure, I'm coming!",
    ('sure', rise_fall, 0),
    "This is a craaaaaaazy way to do it.",
    ('crazy', falling, 0),
    "Its too faaaaaaar.",
    ('far', rise_fall, 0),
    "Waaaaaaa, this is a big prize.",
    ('wa', rise_fall, 0),
    "That could be a lo~~~~~~ng way!",
    ('long', rising, 0),
    "Cooooooool, let's go.",
    ('cool', rise_fall, 0),
    "Then we will dieeeeeee.",
    ('die', falling, 0),
    "Yesterdays lunch was reaaaaaaally good.",
    ('really', rising, 0),
    "Nooooooo, i won't give it to you.",
    ('no', rise_fall, 0),
    "Will be fiiiiiiine.",
    ('fine', falling, 0),
]
# Split the alternating list back into its two parallel halves.
sentences = [entry for entry in long_phone_sentences if isinstance(entry, str)]
word_tones_list = [entry for entry in long_phone_sentences if isinstance(entry, tuple)]

In [85]:
# Display the sentence half of long_phone_sentences as a sanity check.
sentences

["Suuuuuuure, I'm coming!",
 'This is a craaaaaaazy way to do it.',
 'Its too faaaaaaar.',
 'Waaaaaaa, this is a big prize.',
 'That could be a lo~~~~~~ng way!',
 "Cooooooool, let's go.",
 'Then we will dieeeeeee.',
 'Yesterdays lunch was reaaaaaaally good.',
 "Nooooooo, i won't give it to you.",
 'Will be fiiiiiiine.']

In [6]:
from prosody.utils.audio import td_psola_stretch
def stretch(orig_text, output_dict, d_split_factor):
    """Create the TD-PSOLA stretched variant of an already saved '.orig.wav' file.

    Reads ``<clean text>.orig.wav`` from ``save_dir`` and writes
    ``<clean text>.psola.wav`` next to it via ``td_psola_stretch``.

    Args:
        orig_text: Text whose cleaned form names the input and output files.
        output_dict: Inference result; its ``'duration'`` tensor is passed on
            as a NumPy array.
        d_split_factor: Tensor of per-phoneme split factors. Exactly one entry
            must differ from 1; that entry selects the phoneme to stretch and
            its value is the stretch factor.

    Raises:
        ValueError: If zero or more than one entry of ``d_split_factor``
            differs from 1 (only a single stretched phoneme is supported).
    """
    # Validate first so a bad input fails before any file is touched.
    split = d_split_factor.squeeze()
    stretched = (split != 1).nonzero().flatten()
    if stretched.numel() != 1:
        raise ValueError(
            "stretch() needs exactly one phoneme with d_split_factor != 1, "
            f"got {stretched.numel()} for text {orig_text!r}"
        )
    phone_i = stretched.item()
    d_fac = split[phone_i].item()

    clean_text = clean_filename(orig_text)
    filename = f'{clean_text}.orig.wav'
    out_filename = f'{clean_text}.psola.wav'
    durations = output_dict['duration'].cpu().numpy()
    td_psola_stretch(str(save_dir / filename), str(save_dir / out_filename), durations, phone_i, d_fac)

In [7]:
# Render four variants of every elongated-word sentence:
# 'orig', 'psola', 'jets' and 'present'.
for orig_text, word_tones in zip(sentences, word_tones_list):
    (
        phonemes, phone_ids, d_factor, p_factor,
        e_factor, d_split_factor, pitch_values,
    ) = tep.get_inputs(orig_text, word_tones=[word_tones], print_alignment=True)
    p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=None)
    print_table(
        phonemes=phonemes,
        phone_ids=phone_ids,
        d_factor=d_factor,
        p_factor=p_factor,
        e_factor=e_factor,
        d_split_factor=d_split_factor,
        pitch_values=pitch_values,
    )
    # Plain synthesis; stretch() reads the resulting '.orig.wav' file.
    output_dict = save_wav(orig_text, phone_ids=phone_ids, verbose=True, suffix='orig')
    # Writes the '.psola.wav' variant from the file saved above.
    stretch(orig_text, output_dict, d_split_factor)
    # 'jets': the split factors are passed as the model's d_factor.
    save_wav(orig_text, phone_ids=phone_ids, d_factor=d_split_factor, suffix='jets')
    # 'present': duration splitting plus the pitch modification functions.
    save_wav(
        orig_text,
        phone_ids=phone_ids,
        d_factor=d_factor,
        d_split_factor=d_split_factor,
        p_mod_fns=p_mod_fns,
        verbose=True,
        suffix='present',
    )

                       -1       0    1    2    3    4    5    6    7    8    9    10    11    12
grapheme_list  s       u        r    e    ,    i    '    m    c    o    m    i    ng    !
phoneme_list   SH     UH1       R         ,   AY1        M    K   AH1   M   IH0   NG    !
  low_tones
 high_tones
  emphases
   longers           ~~~~~~
pitch_values   -1  -1|1.5|1|0   0
      -1         0       1        2    3    4    5    6    7    8    9    10    11
   phonemes     SH      UH1       R    ,   AY1   M    K   AH1   M   IH0   NG    !
  phone_ids     35       44       7   23   32   14   10   19   14   13    36    72
   d_factor      1       1        1    1    1    1    1    1    1    1    1     1
   p_factor      0       0        0    0    0    0    0    0    0    0    0     0
   e_factor      0       0        0    0    0    0    0    0    0    0    0     0
d_split_factor   1       4        1    1    1    1    1    1    1    1    1     1
 pitch_values   -1   -1|1.5|1|0   0
Duration pred:

In [9]:
# Re-renders the 'jets' variant using whatever orig_text / phone_ids /
# d_split_factor are currently in the kernel namespace (no assignment in this
# cell) — NOTE(review): the result depends on which cell ran last.
save_wav(orig_text, phone_ids=phone_ids, d_factor=d_split_factor, suffix='jets')


{'wav': tensor([0.0006, 0.0008, 0.0010,  ..., 0.0004, 0.0005, 0.0004], device='cuda:0'),
 'duration': tensor([ 6,  6,  6,  5,  5,  8,  9, 12,  9, 51,  9,  9], device='cuda:0')}

In [8]:
# Root of the DailyTalk recordings. Override with the DAILYTALK_DIR
# environment variable; the default is the original author's location.
DAILYTALK_DIR = Path(os.environ.get('DAILYTALK_DIR', '/home/perry/PycharmProjects/dailytalk/data'))

# (question text, ground-truth recording relative to DAILYTALK_DIR)
dailytalk_pairs = [
    ("What are you working on?", '0/0_1_d0.wav'),
    ("Well, how does it look?", '1/0_0_d1.wav'),
    ("Is that Mister Edna Kent?", '2/6_0_d2.wav'),
    ("Did you bring some lunch with you?", '3/0_1_d3.wav'),
    ("Who taught you to put on make-up?", '4/2_0_d4.wav'),
    ("Excuse me, could you tell me where you have got that music book?", '5/0_0_d5.wav'),
    ("What do I need to take with me?", '6/3_1_d6.wav'),
    ("Can you come to my office this afternoon at three o clock?", '7/5_1_d7.wav'),
    ("Can I help you?", '8/0_0_d8.wav'),
    ("You mean the boy who felt carsick just now?", '9/7_0_d9.wav'),
]

questions = [question for question, _ in dailytalk_pairs]
# Kept as plain strings, like the original hard-coded paths.
questions_gts = [str(DAILYTALK_DIR / rel_path) for _, rel_path in dailytalk_pairs]
# Flat alternating [question, path, question, path, ...] list, as before.
dailytalk = [item for pair in zip(questions, questions_gts) for item in pair]

In [19]:
import shutil

# Destination for the ground-truth recordings. Override with the
# QUESTIONS_PUBLIC_DIR environment variable; the default is the original
# author's location.
QUESTIONS_PUBLIC_DIR = Path(os.environ.get('QUESTIONS_PUBLIC_DIR', '/home/perry/firebase_asru/public/questions'))

for question, wav_path in zip(questions, questions_gts):
    # File name: the question without leading/trailing '?', spaces replaced
    # by underscores, tagged '.dailytalk.wav'.
    filename = str(QUESTIONS_PUBLIC_DIR / (question.strip('?').replace(' ', '_') + '.dailytalk.wav'))
    shutil.copyfile(wav_path, filename)

In [9]:
# questions[0]: "What are you working on?"
orig_text = questions[0]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('what', rising, 0), ('on', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0     1      2    3    4    5    6    7    8    9    10    11     12     13    14
   phonemes      W    AH1     T   AA1   R    Y   UW1   W   ER1   K   IH0    NG    AA1     N     ?
  phone_ids     16     19     4   24    7   41   28   16   38   10    13    36     24     3     70
   d_factor      1     1      1    1    1    1    1    1    1    1    1     1      1      1     1
   p_factor      0     0      0    0    0    0    0    0    0    0    0     0      0      0     0
   e_factor      0     0      0    0    0    0    0    0    0    0    0     0      0      0     0
d_split_factor   1     1      1    1    1    1    1    1    1    1    1     1      1      1     1
 pitch_values   -1   -1|1.5  1.5                                                 -1|1.5  1.5
Duration pred: tensor([ 4.9279,  5.1688,  3.6552,  4.6222,  8.7406,  6.2302,  8.1475,  7.8196,
         7.6934,  6.9987,  6.4065,  9.5552, 13.5481,  9.9024,  8.2748],
       device='cuda:0')
Pitch pred: tensor([ 0.2061,

{'wav': tensor([ 4.6483e-04,  5.3581e-04,  5.2439e-04,  ..., -9.3417e-05,
         -4.3272e-04, -3.9974e-04], device='cuda:0'),
 'duration': tensor([ 5,  5,  4,  5,  9,  6,  8,  8,  8,  7,  6, 10, 14, 10,  8],
        device='cuda:0')}

In [10]:
# questions[1]: "Well, how does it look?"
orig_text = questions[1]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('how', rising, 0), ('look', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2    3    4     5      6    7    8    9    10    11     12     13    14
   phonemes      W   EH1   L    ,   HH    AW1     D   AH1   Z   IH1   T     L     UH1     K     ?
  phone_ids     16   15    8   23   26     42     5   19   11   12    4     8      44     10    70
   d_factor      1    1    1    1    1     1      1    1    1    1    1     1      1      1     1
   p_factor      0    0    0    0    0     0      0    0    0    0    0     0      0      0     0
   e_factor      0    0    0    0    0     0      0    0    0    0    0     0      0      0     0
d_split_factor   1    1    1    1    1     1      1    1    1    1    1     1      1      1     1
 pitch_values                       -1   -1|1.5                             -1   -1|1.5  1.5
Duration pred: tensor([ 9.1000,  8.4847, 11.0818, 18.9227,  8.5377, 11.9354,  8.1488,  7.0069,
         6.5353,  6.2313,  8.1652,  8.3556,  9.9118, 11.5257,  8.7501],
       device='cuda:0')
Pitch pred: tensor([ 0.3497,

{'wav': tensor([ 0.0003,  0.0003,  0.0005,  ..., -0.0004, -0.0004, -0.0004],
        device='cuda:0'),
 'duration': tensor([ 9,  8, 11, 19,  9, 12,  8,  7,  7,  6,  8,  8, 10, 12,  9],
        device='cuda:0')}

In [11]:
# questions[2]: "Is that Mister Edna Kent?"
orig_text = questions[2]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('is', low, 0), ('that', rising, 0), ('kent', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2     3      4    5    6    7    8    9    10    11    12    13    14     15     16    17    18
   phonemes     IH1   Z   DH    AE1     T    M   IH1   S    T   ER0  EH1    D     N    AH0    K     EH1     N     T     ?
  phone_ids     12   11    9     18     4   14   12    6    4   21    15    5     3     2     10     15     3     4     70
   d_factor      1    1    1     1      1    1    1    1    1    1    1     1     1     1     1      1      1     1     1
   p_factor      0    0    0     0      0    0    0    0    0    0    0     0     0     0     0      0      0     0     0
   e_factor      0    0    0     0      0    0    0    0    0    0    0     0     0     0     0      0      0     0     0
d_split_factor   1    1    1     1      1    1    1    1    1    1    1     1     1     1     1      1      1     1     1
 pitch_values   -1   -1   -1   -1|1.5  1.5                                                    -1   -1|1.5  1.5   1.5
Duration pred: tensor([9.27

{'wav': tensor([ 1.7980e-03,  1.7926e-03,  1.8409e-03,  ..., -2.3648e-04,
         -1.4819e-04, -2.7582e-05], device='cuda:0'),
 'duration': tensor([9, 8, 6, 9, 5, 5, 5, 6, 9, 9, 9, 7, 6, 7, 8, 9, 9, 7, 9],
        device='cuda:0')}

In [12]:
# questions[3]: "Did you bring some lunch with you?"
orig_text = questions[3]
# (word, contour, index) entries handed to tep.get_inputs.
# NOTE(review): 'you' is listed twice with the same index — confirm the second
# entry reaches the second "you" in the sentence.
word_tones = [('did', low, 0), ('you', rising, 0), ('you', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2    3     4      5    6    7    8    9    10    11    12    13    14    15    16    17    18    19     20     21
   phonemes      D   IH1   D    Y    UW1     B    R   IH1  NG    S   AH1    M     L    AH1    N     CH    W    IH1    DH    Y     UW1     ?
  phone_ids      5   12    5   41     28    25    7   12   36    6    19    14    8     19    3     39    16    12    9     41     28     70
   d_factor      1    1    1    1     1      1    1    1    1    1    1     1     1     1     1     1     1     1     1     1      1      1
   p_factor      0    0    0    0     0      0    0    0    0    0    0     0     0     0     0     0     0     0     0     0      0      0
   e_factor      0    0    0    0     0      0    0    0    0    0    0     0     0     0     0     0     0     0     0     0      0      0
d_split_factor   1    1    1    1     1      1    1    1    1    1    1     1     1     1     1     1     1     1     1     1      1      1
 pitch_values   -1

{'wav': tensor([-1.2979e-04,  9.9986e-05,  2.6319e-04,  ..., -4.0486e-04,
         -4.8969e-04, -5.0143e-04], device='cuda:0'),
 'duration': tensor([ 7,  6,  4,  6,  7,  6,  7,  6,  9,  9,  6,  8,  6,  7,  7,  7,  6,  7,
          9, 11, 11, 10], device='cuda:0')}

In [13]:
# questions[4]: "Who taught you to put on make-up?"
orig_text = questions[4]
# (word, contour, index) entries handed to tep.get_inputs.
# NOTE(review): 'make-up' uses index 1 where every other entry uses 0 —
# presumably it selects a position inside the word; confirm against
# TextEffectProcessor.
word_tones = [('who', rising, 0), ('make-up', rising, 1)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0     1      2    3    4    5    6    7    8    9    10    11    12    13    14    15    16     17     18    19
   phonemes     HH    UW1     T   AO1   T    Y   UW1   T   UW1   P   UH1    T    AA1    N     M    EY1    K     AH2     P     ?
  phone_ids     26     28     4   30    4   41   28    4   28   17    44    4     24    3     14    31    10     62     17    70
   d_factor      1     1      1    1    1    1    1    1    1    1    1     1     1     1     1     1     1      1      1     1
   p_factor      0     0      0    0    0    0    0    0    0    0    0     0     0     0     0     0     0      0      0     0
   e_factor      0     0      0    0    0    0    0    0    0    0    0     0     0     0     0     0     0      0      0     0
d_split_factor   1     1      1    1    1    1    1    1    1    1    1     1     1     1     1     1     1      1      1     1
 pitch_values   -1   -1|1.5                                                                           

{'wav': tensor([-1.4456e-04, -3.0431e-04, -1.2317e-05,  ...,  9.3130e-04,
          9.9696e-04,  1.0555e-03], device='cuda:0'),
 'duration': tensor([ 4,  8,  8, 11,  8,  6, 10,  7,  7,  7,  6,  7,  9,  7,  6,  8,  8,  9,
          7,  6], device='cuda:0')}

In [14]:
# questions[5]: "Excuse me, could you tell me where you have got that music book?"
orig_text = questions[5]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('could', low, 0), ('you', rising, 0), ('book', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2    3    4    5    6    7    8    9    10    11    12    13     14     15    16    17    18    19    20    21    22    23    24    25    26    27    28    29    30    31    32    33    34    35    36    37    38    39    40     41     42    43
   phonemes     IH0   K    S    K    Y   UW1   S    M   IY1   ,    K    UH1    D     Y     UW1     T    EH1    L     M    IY1    W    EH1    R     Y    UW1    HH   AE1    V     G    AA1    T     DH   AE1    T     M     Y    UW1    Z    IH0    K     B     UH1     K     ?
  phone_ids     13   10    6   10   41   28    6   14   27   23    10    44    5     41     28     4     15    8     14    27    16    15    7     41    28    26    18    20    37    24    4     9     18    4     14    41    28    11    13    10    25     44     10    70
   d_factor      1    1    1    1    1    1    1    1    1    1    1     1     1     1      1      1     1     1     1     1     1     1     1     1     1     1     1     1     1     1 

{'wav': tensor([ 0.0018,  0.0007, -0.0002,  ...,  0.0003,  0.0005,  0.0006],
        device='cuda:0'),
 'duration': tensor([ 8,  7,  9,  6,  7,  9, 11,  9, 13, 11,  6,  5,  4,  5,  7,  7,  7,  7,
          8, 15,  7,  8,  7,  7,  6,  5,  6,  7,  7,  7,  7,  6,  8,  7,  7,  6,
          7,  6,  6,  7,  8, 11, 11,  9], device='cuda:0')}

In [15]:
# questions[6]: "What do I need to take with me?"
orig_text = questions[6]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('what', rising, 0), ('me', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0     1      2    3    4    5    6    7    8    9    10    11    12    13    14    15    16    17     18     19
   phonemes      W    AH1     T    D   UW1  AY1   N   IY1   D    T   UW1    T    EY1    K     W    IH1    DH    M     IY1     ?
  phone_ids     16     19     4    5   28   32    3   27    5    4    28    4     31    10    16    12    9     14     27     70
   d_factor      1     1      1    1    1    1    1    1    1    1    1     1     1     1     1     1     1     1      1      1
   p_factor      0     0      0    0    0    0    0    0    0    0    0     0     0     0     0     0     0     0      0      0
   e_factor      0     0      0    0    0    0    0    0    0    0    0     0     0     0     0     0     0     0      0      0
d_split_factor   1     1      1    1    1    1    1    1    1    1    1     1     1     1     1     1     1     1      1      1
 pitch_values   -1   -1|1.5  1.5                                                                      

{'wav': tensor([ 0.0002,  0.0003,  0.0003,  ..., -0.0003, -0.0002, -0.0002],
        device='cuda:0'),
 'duration': tensor([ 6,  6,  8,  8, 11, 13,  9, 10,  6,  6,  7,  9,  8,  6,  5,  7,  8, 10,
         11,  9], device='cuda:0')}

In [16]:
# questions[7]: "Can you come to my office this afternoon at three o clock?"
orig_text = questions[7]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('can', low, 0), ('you', rising, 0), ('clock', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2    3     4      5    6    7    8    9    10    11    12    13    14    15    16    17    18    19    20    21    22    23    24    25    26    27    28    29    30    31    32    33     34     35    36
   phonemes      K   AE1   N    Y    UW1     K   AH1   M    T   UW1   M    AY1   AO1    F    AH0    S     DH   IH1    S    AE2    F     T    ER0    N    UW1    N    AE1    T     TH    R    IY1   OW1    K     L     AA1     K     ?
  phone_ids     10   18    3   41     28    10   19   14    4   28    14    32    30    22    2     6     9     12    6     50    22    4     21    3     28    3     18    4     43    7     27    34    10    8      24     10    70
   d_factor      1    1    1    1     1      1    1    1    1    1    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1      1      1     1
   p_factor      0    0    0    0     0      0    0    0    0    0    0     0 

{'wav': tensor([-0.0091, -0.0027, -0.0032,  ...,  0.0005,  0.0003,  0.0004],
        device='cuda:0'),
 'duration': tensor([ 7,  9,  6,  5,  8,  7,  8,  9,  8,  6,  9, 11, 12,  9,  9, 17, 14,  7,
          9,  9,  8,  7,  5,  8,  8,  6,  8,  8,  7,  9, 13, 12,  9,  9, 12, 14,
          9], device='cuda:0')}

In [17]:
# questions[8]: "Can I help you?"
orig_text = questions[8]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('can', low, 0), ('i', rising, 0), ('you', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2     3      4    5    6    7    8     9      10
   phonemes      K   AE1   N    AY1    HH   EH1   L    P    Y    UW1     ?
  phone_ids     10   18    3     32    26   15    8   17   41     28     70
   d_factor      1    1    1     1      1    1    1    1    1     1      1
   p_factor      0    0    0     0      0    0    0    0    0     0      0
   e_factor      0    0    0     0      0    0    0    0    0     0      0
d_split_factor   1    1    1     1      1    1    1    1    1     1      1
 pitch_values   -1   -1   -1   -1|1.5                      -1   -1|1.5
Duration pred: tensor([ 7.5181,  5.3018,  7.4202,  9.8081,  8.5703,  7.6117,  8.8028, 10.9280,
        12.1972,  9.4722,  9.8217], device='cuda:0')
Pitch pred: tensor([ 0.6783,  0.4176,  0.1487, -0.0547, -0.0795, -0.1784, -0.2895, -0.2012,
        -0.3101, -0.2473, -0.1040], device='cuda:0')
Energy pred: tensor([-0.5433,  1.7549,  1.2119,  0.8535, -0.4994,  1.2644, -0.5762, -0.8564,
         0.3001,

{'wav': tensor([8.0668e-04, 6.4678e-04, 5.7723e-04,  ..., 5.5897e-05, 3.0706e-04,
         3.1662e-04], device='cuda:0'),
 'duration': tensor([ 8,  5,  7, 10,  9,  8,  9, 11, 12,  9, 10], device='cuda:0')}

In [18]:
# questions[9]: "You mean the boy who felt carsick just now?"
orig_text = questions[9]
# (word, contour, index) entries handed to tep.get_inputs.
word_tones = [('you', low, 0), ('mean', rising, 0), ('now', rising, 0)]
(
    phonemes, phone_ids, d_factor, p_factor,
    e_factor, d_split_factor, pitch_values,
) = tep.get_inputs(orig_text, word_tones=word_tones)
p_mod_fns = get_p_mod_fns(pitch_values, combine_fns=combine_fn)
print_table(
    phonemes=phonemes,
    phone_ids=phone_ids,
    d_factor=d_factor,
    p_factor=p_factor,
    e_factor=e_factor,
    d_split_factor=d_split_factor,
    pitch_values=pitch_values,
)
# 'present': synthesis with the pitch modification functions applied.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=p_mod_fns,
    verbose=True, suffix='present',
)
# 'jets': same inputs without pitch modification; as the last expression its
# result is the cell output.
save_wav(
    orig_text, phone_ids=phone_ids, d_factor=d_factor,
    d_split_factor=d_split_factor, p_mod_fns=None, suffix='jets',
)

      -1         0    1    2     3      4    5    6    7    8    9    10    11    12    13    14    15    16    17    18    19    20    21    22    23    24    25     26     27
   phonemes      Y   UW1   M    IY1     N   DH   AH0   B   OY1  HH   UW1    F    EH1    L     T     K    AA1    R     S    IH0    K     JH   AH1    S     T     N     AW1     ?
  phone_ids     41   28   14     27     3    9    2   25   55   26    28    22    15    8     4     10    24    7     6     13    10    40    19    6     4     3      42     70
   d_factor      1    1    1     1      1    1    1    1    1    1    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1      1      1
   p_factor      0    0    0     0      0    0    0    0    0    0    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0      0      0
   e_factor      0    0    0     0      0    0    0    0    0    0    0     0     0     0     0     0     0     0     

{'wav': tensor([0.0022, 0.0021, 0.0024,  ..., 0.0012, 0.0011, 0.0010], device='cuda:0'),
 'duration': tensor([ 9,  7,  6, 11, 11,  6,  8, 18, 12, 13, 11,  9,  7,  6, 10,  6,  7,  8,
          8,  6, 12,  7,  7,  8,  9, 16, 13, 12], device='cuda:0')}

In [None]:
import os

import pandas as pd

# Listening-test results. Override the location with the MOS_DATA_CSV
# environment variable; the default is the original author's path.
MOS_DATA_CSV = os.environ.get('MOS_DATA_CSV', '/home/perry/PycharmProjects/present/prosody/paper/data.csv')
df = pd.read_csv(MOS_DATA_CSV)


def _cols_with_prefix(columns, prefix):
    """Return the column names in ``columns`` that start with ``prefix``."""
    return [x for x in columns if x.startswith(prefix)]


def _mean_of_column_means(frame, columns):
    """Average the per-column means of ``columns`` (mean of means, as before)."""
    return frame[columns].mean().mean()


# Columns 1..30: the long-duration items, one column per (system, sentence).
# NOTE(review): positional slices assume a fixed CSV layout — confirm against
# the file.
longdur_cols = df.columns[1:31]
psola_cols = _cols_with_prefix(longdur_cols, 'psola')
present_longdur_cols = _cols_with_prefix(longdur_cols, 'present')
jets_longdur_cols = _cols_with_prefix(longdur_cols, 'jets')

psola_mos = _mean_of_column_means(df, psola_cols)
jets_longdur_mos = _mean_of_column_means(df, jets_longdur_cols)
present_longdur_mos = _mean_of_column_means(df, present_longdur_cols)

# Columns 31..60: the question items.
qn_cols = df.columns[31:61]
dailytalk_cols = _cols_with_prefix(qn_cols, 'dailytalk')
jets_qn_cols = _cols_with_prefix(qn_cols, 'jets')
present_qn_cols = _cols_with_prefix(qn_cols, 'present')

dailytalk_mos = _mean_of_column_means(df, dailytalk_cols)
jets_qn_mos = _mean_of_column_means(df, jets_qn_cols)
present_qn_mos = _mean_of_column_means(df, present_qn_cols)

print(psola_mos, jets_longdur_mos, present_longdur_mos)
print(dailytalk_mos, jets_qn_mos, present_qn_mos)

# Per-column means of every column whose name starts with 'Prosody'.
transfer_cols = _cols_with_prefix(df.columns, 'Prosody')
print(df[transfer_cols].mean())