In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import sys
import pickle
sys.path.append('../../')
print(sys.path)
import IPython.display as ipd
import numpy as np
from tools.libaudio.feature import melspectrogram
from tools.libaudio.display import show_spec, show_mel
from mlutils.utils import plot, plots, to_onehot
from models.mlmodeldic import best_model_path_and_settings
import torch
from tts.Alternative.neuravoice import CharToMel
from tts.Alternative.train import train_char2mel
from tts.Alternative.dataset import NeuraVoiceDataset
from torch.utils.data import DataLoader
import time

['/var/Sources/experiments/core/tts/Alternative', '/home/kaz/anaconda3/envs/core/lib/python37.zip', '/home/kaz/anaconda3/envs/core/lib/python3.7', '/home/kaz/anaconda3/envs/core/lib/python3.7/lib-dynload', '', '/home/kaz/.local/lib/python3.7/site-packages', '/home/kaz/anaconda3/envs/core/lib/python3.7/site-packages', '/home/kaz/anaconda3/envs/core/lib/python3.7/site-packages/Mako-1.0.7-py3.7.egg', '/home/kaz/anaconda3/envs/core/lib/python3.7/site-packages/IPython/extensions', '/home/kaz/.ipython', '../../']


####  Sequence Generator with location-based Attention

```
x: input sequence
y: mel spectrum sequence
U: char length
c: char sequence
T: timestep length of input x
wt: window vector into c at timestep t
φ(t, u): window weight of cu at timestep t
αt: parameters controlling the importance of the window within the mixture
βt: parameters controlling the width of the window
κt: parameters controlling the location of the window
(eq.46): discrete convolution with a mixture of K Gaussian functions

xt ∈ R×R×{0,1}

h1t = H(Wih1 xt + Wh1h1 h1t-1 + Wwh1 wt-1 + b1h)

(αhat_t, βhat_t, κhat_t) = Wh1p h1t + bp  # window parameters, computed from the output of the first hidden layer

αt = exp(αhat_t)  # importance of the window
βt = exp(βhat_t)  # width of the window
κt = κt-1 + exp(κhat_t)  # location of the window (how far to slide each window)

φ(t, u) = Σk=1->K αkt*exp(-βkt(κkt-u)^2)  # mixture of K Gaussian


cf. 
normal distribution
N(x|μ,σ^2) = 1/(2πσ^2)^(1/2) exp{-(x-μ)^2 / (2σ^2)}

mixture gaussians
p(x) = Σk=1->K πk N(x|μk,Σk)  # where πk: mixing coefficient, μk: mean, Σk: covariance, N(x|μk,Σk): mixture component
p(x) = Σk=1->K πk (1/(2π)^(D/2) 1/|Σk|^(1/2)) exp{-1/2 (x-μk)^T Σk^-1 (x-μk)}  # where Σk: DxD covariance matrix, |Σk|: det Σk

wt = Σu=1->U φ(t, u)*cu  # the soft window into c at timestep t

hnt = H(Wihn xt + Wh(n-1)hn h(n-1)t + Whnhn hn(t-1) + Wwhn wt + bnh)  # h(n-1)t: layer n-1 at timestep t; hn(t-1): layer n at timestep t-1

yˆt = (eˆt,{πˆj_t,μˆj_t,σˆj_t,ρˆj_t}^M_j=1) = by + Σn=1->N Whny hnt
yt = Y(yˆt)

et = 1 / (1 + exp(eˆt)) ⇒ et∈(0,1) : stroke probability
πjt = exp(πˆjt) / (Σj'=1->M exp(πˆj't)) ⇒ πjt∈(0,1), Σj πjt=1 : mixture weights
μjt = μˆjt ⇒ μjt∈R : means
σjt = exp(σˆjt) ⇒ σjt > 0 : std
ρjt = tanh(ρˆjt) ⇒ ρjt∈(-1,1) : correlations

Pr(x|c) = ∏t=1->T Pr(xt+1|yt)
L(x) = -log Pr(x|c)
```

In [2]:
# ver.1 hidden 256 out 256
# ver.2 hidden 256 out 256 
# ver.3 hidden 512 out 512, trainable init param
# ver.4 hidden 512 out 512, trainable init param, batchnorm, relu
# ver.5 hidden 512 out 512, trainable init param, batchnorm
# ver.6 hidden 512 out 512, trainable init param, no encoder
# ver.7 K=10, hidden 512 out 512, trainable init param, no encoder, use full gru
# ver.9 fixed kappa_t_1 <- kappa_t feedback loop

In [3]:
# Experiment configuration: model variant, layer widths, batch size, learning rate and GPU index.
model_version, cuda = 9, 2        # variant tag forwarded to CharToMel; index used to build the device string
hidden_size = out_size = 512      # both widths are equal for this version
batch_size, lr = 16, 0.0001
# Run name passed to the saved-model lookup and to the training call.
model_name = f'char2mel{hidden_size}_{out_size}_{batch_size}_ver{model_version}'

In [4]:
# Build the dataset, then wrap it in a shuffling DataLoader that collates
# each batch with the dataset's own char_to_mel method.
dataset = NeuraVoiceDataset(batch_size=batch_size)
loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=False,
    num_workers=1,
    collate_fn=dataset.char_to_mel,
)

In [5]:
# Query best_model_path_and_settings for this run name, using the
# 'loss_ave' metric with is_lower_better=True.
model_path, settings = best_model_path_and_settings(
    model_name,
    'loss_ave',
    is_lower_better=True,
)
# Bare expression so the cell displays both values.
model_path, settings

(None, None)

In [6]:
# Device string built from the `cuda` index set in the configuration cell.
# NOTE(review): with cuda = 2 this resolves to 'cuda:2', which presumably
# requires at least three visible GPUs — confirm on the target machine.
device = f'cuda:{cuda}'

In [7]:
# Toggle for the model-construction cell: when False, a fresh CharToMel is
# built even if the lookup returned a saved model path.
use_best_model = False

In [8]:
# Restore the saved model only when the lookup found one AND use_best_model is
# set; otherwise build a fresh CharToMel.
if model_path and use_best_model:
    model = CharToMel.init_from_settings(settings, model_path, device=device, version=model_version)
else:
    # Take the widths from the configuration variables instead of repeating the
    # literal 512, so the network always matches the sizes encoded in model_name.
    model = CharToMel(
        encode_type='onehot',
        K=10,
        hidden_size=hidden_size,
        out_size=out_size,
        version=model_version,
        device=device,
    )

model version 9
Trainable Parameters: 5.600 million


In [None]:
# Launch training and unpack the three returned values.
# NOTE(review): the saved log under this cell reports 'epoch 0/99' although
# n_epoch=1000 is passed here — confirm whether that output is stale.
losses, loss_aves, model = train_char2mel(
    model,
    loader,
    n_epoch=1000,
    model_name=model_name,
    device=device,
    lr=lr,
    verbose=False,
)

epoch 0/99 iter: 1584/7614 total_iter: 100-- loss ave: 10.7123 loss: 7.97 -- elapse: 31m 26s speed 0.1 steps/sec
epoch 0/99 iter: 3184/7614 total_iter: 200-- loss ave: 8.3348 loss: 4.10 -- elapse: 50m 19s speed 0.1 steps/sec
epoch 0/99 iter: 4784/7614 total_iter: 300-- loss ave: 6.6716 loss: 3.46 -- elapse: 58m 42s speed 0.1 steps/sec
epoch 0/99 iter: 6384/7614 total_iter: 400-- loss ave: 5.6017 loss: 2.02 -- elapse: 1h 6m 58s speed 0.1 steps/sec
epoch 1/99 iter: 368/7614 total_iter: 500-- loss ave: 4.8374 loss: 2.14 -- elapse: 1h 15m 30s speed 0.1 steps/sec
epoch 1/99 iter: 1968/7614 total_iter: 600-- loss ave: 4.2601 loss: 1.67 -- elapse: 1h 23m 36s speed 0.1 steps/sec
epoch 1/99 iter: 3568/7614 total_iter: 700-- loss ave: 3.7986 loss: 0.62 -- elapse: 1h 32m 15s speed 0.1 steps/sec
epoch 1/99 iter: 5168/7614 total_iter: 800-- loss ave: 3.4377 loss: 0.61 -- elapse: 1h 40m 18s speed 0.1 steps/sec
epoch 1/99 iter: 6768/7614 total_iter: 900-- loss ave: 3.1494 loss: 0.73 -- elapse: 1h 48m

In [None]:
#plot(loss_aves)

In [None]:
#plots([losses, loss_aves], labels=['losses', 'loss_aves'])