<a href="https://colab.research.google.com/github/kurianbenoy/MTTS/blob/master/nbs/Transliteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text-to-Speech with Mozilla Tacotron+WaveRNN

This is an English female-voice TTS demo using the open-source projects [mozilla/TTS](https://github.com/mozilla/TTS/) and [erogol/WaveRNN](https://github.com/erogol/WaveRNN).

For other deep-learning Colab notebooks, visit [tugstugi/dl-colab-notebooks](https://github.com/tugstugi/dl-colab-notebooks).

## Install Mozilla TTS and WaveRNN

In [1]:
import os
import time
from os.path import exists, join, basename, splitext

git_repo_url = 'https://github.com/mozilla/TTS.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  !git clone -q {git_repo_url}
  !cd {project_name} && git checkout Tacotron2-iter-260K-824c091
  !pip install -q gdown lws librosa Unidecode==0.4.20 tensorboardX git+git://github.com/bootphon/phonemizer@master localimport
  !apt-get install -y espeak
git_repo_url = 'https://github.com/erogol/WaveRNN.git'
project_name = splitext(basename(git_repo_url))[0]
if not exists(project_name):
  !git clone -q {git_repo_url}
  !cd {project_name} && git checkout 8a1c152 && pip install -q -r requirements.txt

  
import sys
sys.path.append('TTS')
sys.path.append('WaveRNN')
from localimport import localimport
  
from IPython.display import Audio, display

Branch 'Tacotron2-iter-260K-824c091' set up to track remote branch 'Tacotron2-iter-260K-824c091' from 'origin'.
Switched to a new branch 'Tacotron2-iter-260K-824c091'
[K     |████████████████████████████████| 153kB 2.8MB/s 
[K     |████████████████████████████████| 235kB 8.8MB/s 
[K     |████████████████████████████████| 204kB 7.1MB/s 
[K     |████████████████████████████████| 194kB 11.9MB/s 
[K     |████████████████████████████████| 51kB 5.5MB/s 
[?25h  Building wheel for lws (setup.py) ... [?25l[?25hdone
  Building wheel for localimport (setup.py) ... [?25l[?25hdone
  Building wheel for phonemizer (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  espeak-data libespeak1 libportaudio2 libsonic0
The following NEW 

## Download pretrained models

In [2]:
# WaveRNN
!mkdir -p wavernn_models tts_models
wavernn_pretrained_model = 'wavernn_models/checkpoint_433000.pth.tar'
if not exists(wavernn_pretrained_model):
  !gdown -O {wavernn_pretrained_model} https://drive.google.com/uc?id=12GRFk5mcTDXqAdO5mR81E-DpTk8v2YS9
wavernn_pretrained_model_config = 'wavernn_models/config.json'
if not exists(wavernn_pretrained_model_config):
  !gdown -O {wavernn_pretrained_model_config} https://drive.google.com/uc?id=1kiAGjq83wM3POG736GoyWOOcqwXhBulv
    
# TTS
tts_pretrained_model = 'tts_models/checkpoint_261000.pth.tar'
if not exists(tts_pretrained_model):
  !gdown -O {tts_pretrained_model} https://drive.google.com/uc?id=1otOqpixEsHf7SbOZIcttv3O7pG0EadDx
tts_pretrained_model_config = 'tts_models/config.json'
if not exists(tts_pretrained_model_config):
  !gdown -O {tts_pretrained_model_config} https://drive.google.com/uc?id=1IJaGo0BdMQjbnCcOL4fPOieOEWMOsXE-

Downloading...
From: https://drive.google.com/uc?id=12GRFk5mcTDXqAdO5mR81E-DpTk8v2YS9
To: /content/wavernn_models/checkpoint_433000.pth.tar
50.9MB [00:01, 29.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kiAGjq83wM3POG736GoyWOOcqwXhBulv
To: /content/wavernn_models/config.json
100% 2.31k/2.31k [00:00<00:00, 4.00MB/s]
Downloading...
From: https://drive.google.com/uc?id=1otOqpixEsHf7SbOZIcttv3O7pG0EadDx
To: /content/tts_models/checkpoint_261000.pth.tar
338MB [00:07, 42.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IJaGo0BdMQjbnCcOL4fPOieOEWMOsXE-
To: /content/tts_models/config.json
100% 5.75k/5.75k [00:00<00:00, 5.00MB/s]


## Initialize models

In [3]:
#
# this code is copied from: https://github.com/mozilla/TTS/blob/master/notebooks/Benchmark.ipynb
#

import io
import torch 
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt
import IPython

%pylab inline
rcParams["figure.figsize"] = (16,5)

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config, setup_model
from TTS.utils.text import text_to_sequence
from TTS.utils.synthesis import synthesis
from TTS.utils.visual import visualize

def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):
    """Synthesize `text`, play the audio inline and return the intermediate outputs.

    `synthesis` produces a waveform together with the alignment, mel
    spectrograms and stop tokens. Unless `use_gl` is true, that waveform is
    replaced by the output of the notebook-level `wavernn` vocoder, so
    `wavernn` and `batched_wavernn` must already be defined when this runs.

    Args:
        model: model object handed straight to `synthesis`.
        text: sentence to synthesize.
        CONFIG: configuration object; `enable_eos_bos_chars`, `model` and
            `audio['sample_rate']` are read here.
        use_cuda: forwarded to `synthesis`; also decides whether the vocoder
            input tensor is moved to the GPU.
        ap: audio processor; `hop_length` is read, and `out_linear_to_mel`
            is called when `CONFIG.model == "Tacotron"` and the vocoder is used.
        use_gl: when true, keep the waveform returned by `synthesis` and skip
            the `wavernn` vocoder.
        speaker_id: accepted for call-site compatibility; not used by this function.
        figures: when true, call `visualize` on the synthesis outputs.

    Returns:
        Tuple `(alignment, mel_postnet_spec, stop_tokens, waveform)`.
    """
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap,
        truncated=True, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)

    if not use_gl:
        if CONFIG.model == "Tacotron":
            # Convert the post-net output with the audio processor before vocoding.
            mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0)
        # Honour use_cuda instead of forcing the tensor onto the GPU.
        if use_cuda:
            vocoder_input = vocoder_input.cuda()
        waveform = wavernn.generate(vocoder_input, batched=batched_wavernn, target=11000, overlap=550)

    print(" >  Run-time: {}".format(time.time() - t_1))
    if figures:
        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec)
    IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform
  
use_cuda = True
batched_wavernn = True

# Checkpoint placement shared by both torch.load calls: None keeps torch.load's
# default behaviour, the lambda keeps every storage on the CPU.
checkpoint_map_location = None if use_cuda else (lambda storage, loc: storage)

# initialize TTS
CONFIG = load_config(tts_pretrained_model_config)
from TTS.utils.text.symbols import symbols, phonemes
# the input vocabulary size depends on whether the config uses phonemes
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, CONFIG)
# load the audio processor
ap = AudioProcessor(**CONFIG.audio)
# load model state
cp = torch.load(tts_pretrained_model, map_location=checkpoint_map_location)

# restore the weights, then move to the GPU only when requested
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])
model.decoder.max_decoder_steps = 2000

# initialize WaveRNN
VOCODER_CONFIG = load_config(wavernn_pretrained_model_config)
# NOTE(review): absolute Colab path; assumes the WaveRNN checkout lives in /content.
with localimport('/content/WaveRNN') as _importer:
  from models.wavernn import Model
bits = 10

# The model is built without an unconditional .cuda(); device placement is
# decided once, below, by the same use_cuda flag that governs the TTS model.
wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode="mold",
        pad=2,
        upsample_factors=VOCODER_CONFIG.upsample_factors,  # set this depending on dataset
        feat_dims=VOCODER_CONFIG.audio["num_mels"],
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=ap.hop_length,
        sample_rate=ap.sample_rate,
    )
check = torch.load(wavernn_pretrained_model, map_location=checkpoint_map_location)
wavernn.load_state_dict(check['model'])
if use_cuda:
    wavernn.cuda()
wavernn.eval()
print(check['step'])

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > bits:None
 | > sample_rate:22050
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:12.5
 | > frame_length_ms:50
 | > ref_level_db:20
 | > num_freq:1025
 | > power:1.5
 | > preemphasis:0.98
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:False
 | > mel_fmin:0.0
 | > mel_fmax:8000.0
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > n_fft:2048
 | > hop_length:275
 | > win_length:1102
261000
433000


## Sentence to synthesize

## Transliterated text

Thanks to subins (https://demos.subinsb.com/indicen/).

In [0]:
# Sample transliterated sentence 1 to synthesize.
text1 = "vandi idukki etthii"

In [21]:
%time
align, spec, stop_tokens, wav = tts(model, text1, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
48000/48400 -- batch_size: 4 -- gen_rate: 2.4 kHz -- x_realtime: 0.1   >  Run-time: 22.556684970855713


In [0]:
# Sample transliterated sentence 2 to synthesize.
text2 = "njaan innu oru vandi kayari"

In [17]:
%time
align, spec, stop_tokens, wav = tts(model, text2, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs
60000/60500 -- batch_size: 5 -- gen_rate: 3.0 kHz -- x_realtime: 0.1   >  Run-time: 23.55038356781006


In [0]:
# Sample transliterated sentence 3 to synthesize.
text3 = "athil niraye aalukal undaayirunnu"

In [19]:
%time
align, spec, stop_tokens, wav = tts(model, text3, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.63 µs
60000/60500 -- batch_size: 5 -- gen_rate: 3.1 kHz -- x_realtime: 0.1   >  Run-time: 23.613027334213257


In [0]:
# Sample transliterated sentence 4 to synthesize.
text4 = "buddhiyaanu saare avante meyin"

In [23]:
%time
align, spec, stop_tokens, wav = tts(model, text4, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
60000/60500 -- batch_size: 5 -- gen_rate: 2.9 kHz -- x_realtime: 0.1   >  Run-time: 24.14083480834961


In [0]:
# Sample transliterated sentence 5 to synthesize.
text5 = "annaan mootthaalum maram kettam marakkumo?"

In [25]:
%time
align, spec, stop_tokens, wav = tts(model, text5, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs
72000/72600 -- batch_size: 6 -- gen_rate: 3.5 kHz -- x_realtime: 0.2   >  Run-time: 24.96942400932312


In [0]:
# Sample transliterated sentence 6 to synthesize.
text6 = "kayyil kaashundo?"

In [27]:
%time
align, spec, stop_tokens, wav = tts(model, text5, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs
84000/84700 -- batch_size: 7 -- gen_rate: 4.1 kHz -- x_realtime: 0.2   >  Run-time: 25.174461841583252


In [0]:
# Sample transliterated sentence 7 to synthesize.
text7 = "annaan kunjum thannaalaayathu."

In [29]:
%time
align, spec, stop_tokens, wav = tts(model, text5, CONFIG, use_cuda, ap, speaker_id=1, use_gl=False, figures=False)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs
72000/72600 -- batch_size: 6 -- gen_rate: 3.6 kHz -- x_realtime: 0.2   >  Run-time: 24.406561374664307
